Renaming Files from Preservation for Technology Services
Occasionally, we get files from Special Collections or Preservation with very long filenames. Even though we bundle these with SAFCreator before sending them to Technology Services to put online with MAGPIE, the full paths can still be too long, and Technology Services will report that they can't open the path. This appears to be Windows' default MAX_PATH limit of 260 characters. To fix it, before you write your SAF, copy your files into a flatter structure using something like the script below:
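If you want to confirm that path length is actually the problem before reorganizing anything, here is a minimal sketch (assuming the 260-character MAX_PATH limit is the culprit; the script name and directory argument are hypothetical) that flags any file whose absolute path exceeds that limit:

```python
import sys
from pathlib import Path

# Windows' historical default path limit; an assumption about the failure mode
MAX_PATH = 260

def flag_long_paths(root_dir):
    """Print every file under root_dir whose absolute path exceeds MAX_PATH."""
    for path in Path(root_dir).rglob('*'):
        full = str(path.resolve())
        if path.is_file() and len(full) > MAX_PATH:
            print(f"{len(full)} chars: {full}")

if __name__ == '__main__':
    # e.g., python check_paths.py /path/to/files
    flag_long_paths(sys.argv[1])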
```python
import argparse
import csv
import shutil
import os
import sys
from pathlib import Path


def sanitize_filename(filename):
    """Keep only the base filename from a path."""
    return os.path.basename(filename)


def copy_files_and_update_csv(input_csv, output_csv, destination_dir, source_base_dir=None):
    """
    Copy files from CSV to destination and create updated CSV.

    Args:
        input_csv: Path to input CSV file
        output_csv: Path to output CSV file
        destination_dir: Base destination directory for copied files
        source_base_dir: Optional base directory for source files (if paths are relative)
    """
    # Create destination directory
    dest_path = Path(destination_dir)
    dest_path.mkdir(parents=True, exist_ok=True)

    # Track statistics
    stats = {
        'rows_processed': 0,
        'files_copied': 0,
        'files_missing': 0,
        'errors': []
    }

    # Read input CSV and process
    with open(input_csv, 'r', encoding='utf-8') as infile, \
            open(output_csv, 'w', encoding='utf-8', newline='') as outfile:
        reader = csv.DictReader(infile)

        # Verify 'filename' column exists
        if 'filename' not in reader.fieldnames:
            raise ValueError("CSV must contain a 'filename' column")

        writer = csv.DictWriter(outfile, fieldnames=reader.fieldnames)
        writer.writeheader()

        for row_num, row in enumerate(reader, start=1):
            stats['rows_processed'] += 1

            # Get filenames (may be multiple, separated by '||')
            filename_field = row['filename']
            filenames = [f.strip() for f in filename_field.split('||')]

            # Create row directory (e.g., row_001, row_002, etc.)
            row_dir = dest_path / f"row_{row_num:03d}"
            row_dir.mkdir(exist_ok=True)

            # Copy each file and collect new paths
            new_filenames = []
            for file_path in filenames:
                # Determine source path
                if source_base_dir:
                    source_file = Path(source_base_dir) / file_path
                else:
                    source_file = Path(file_path)

                # Get the base filename
                base_name = source_file.name

                # Destination path
                dest_file = row_dir / base_name

                # Copy file
                try:
                    if source_file.exists():
                        shutil.copy2(source_file, dest_file)
                        stats['files_copied'] += 1
                        print(f"Copied: {source_file} -> {dest_file}")
                    else:
                        stats['files_missing'] += 1
                        error_msg = f"Row {row_num}: File not found: {source_file}"
                        stats['errors'].append(error_msg)
                        print(f"WARNING: {error_msg}")
                        # The new path is still recorded below to preserve row structure
                except Exception as e:
                    error_msg = f"Row {row_num}: Error copying {source_file}: {str(e)}"
                    stats['errors'].append(error_msg)
                    print(f"ERROR: {error_msg}")

                # Build new relative path
                new_path = f"{row_dir.name}/{base_name}"
                new_filenames.append(new_path)

            # Update the filename field with new paths
            row['filename'] = '||'.join(new_filenames)

            # Write updated row to output CSV
            writer.writerow(row)

    return stats


def main():
    """Main entry point for the script."""
    parser = argparse.ArgumentParser(
        description='Reorganize files from CSV into sequential directories'
    )
    parser.add_argument('-i', '--input-csv', required=True, help='Input CSV file')
    parser.add_argument('-d', '--destination-dir', required=True,
                        help='Destination directory for copied files')
    parser.add_argument(
        '-o', '--output-csv',
        help='Output CSV file (default: <input>_updated.csv)'
    )
    parser.add_argument(
        '-s', '--source-base',
        help='Base directory for source files (if CSV paths are relative)'
    )
    args = parser.parse_args()

    # Set default output CSV name if not provided
    if not args.output_csv:
        input_path = Path(args.input_csv)
        args.output_csv = input_path.stem + '_updated.csv'

    print(f"Input CSV: {args.input_csv}")
    print(f"Output CSV: {args.output_csv}")
    print(f"Destination: {args.destination_dir}")
    if args.source_base:
        print(f"Source base: {args.source_base}")
    print("-" * 60)

    # Process files
    try:
        stats = copy_files_and_update_csv(
            args.input_csv,
            args.output_csv,
            args.destination_dir,
            args.source_base
        )

        # Print summary
        print("-" * 60)
        print("SUMMARY:")
        print(f"  Rows processed: {stats['rows_processed']}")
        print(f"  Files copied: {stats['files_copied']}")
        print(f"  Files missing: {stats['files_missing']}")
        print(f"  Errors: {len(stats['errors'])}")

        if stats['errors']:
            print("\nErrors encountered:")
            for error in stats['errors'][:10]:  # Show first 10 errors
                print(f"  - {error}")
            if len(stats['errors']) > 10:
                print(f"  ... and {len(stats['errors']) - 10} more errors")

        print(f"\nUpdated CSV written to: {args.output_csv}")

    except Exception as e:
        print(f"FATAL ERROR: {e}", file=sys.stderr)
        sys.exit(1)


if __name__ == '__main__':
    main()
```
This requires an input CSV with a `filename` column, where multiple files for a single row are separated by `||`. Because you might run this from anywhere, I also suggest that the CSV use the full absolute path to each file, not a relative path.
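For example, an input CSV might look like this (the titles and paths here are hypothetical):

```csv
title,filename
"Letter, 1923",/Volumes/preservation/mcinnis/box01/folder02/letter_1923_page1.tif||/Volumes/preservation/mcinnis/box01/folder02/letter_1923_page2.tif
"Photograph, undated",/Volumes/preservation/mcinnis/box03/photo_undated.tif
```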
Finally, give it an output CSV name and a destination directory it can write to. Your full command will look like this:
```
python reorganize_csv_files.py -i mcinnis.csv -d /Volumes/digital_project_management/mark_playground/mcinnis_new -o mcinnis_new.csv
```
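After it runs, the destination directory will contain one folder per CSV row, each holding just the base filenames, and the updated CSV (`mcinnis_new.csv` in the command above) will point at those much shorter relative paths. Using the hypothetical CSV above, the result would look something like this:

```
mcinnis_new/
├── row_001/
│   ├── letter_1923_page1.tif
│   └── letter_1923_page2.tif
└── row_002/
    └── photo_undated.tif
```

You can then point SAFCreator at the updated CSV and the new, flatter directory before sending anything to Technology Services.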