Renaming Files from Preservation for Technology Services¶

Occasionally, we will get files from Special Collections or Preservation with very long filenames. Even though we bundle these with SAFCreator before we send to technology services to put online with MAGPIE, the filenames may be long and they will say they can’t open the path. I think this is some Windows thing that I know nothing about. To fix, before you write your SAF, you can copy your files to a flatter structure using something like this:

import csv
import shutil
import os
import sys
from pathlib import Path


def sanitize_filename(filename):
    """Keep only the base filename from a path."""
    return os.path.basename(filename)


def copy_files_and_update_csv(input_csv, output_csv, destination_dir, source_base_dir=None):
    """
    Copy files from CSV to destination and create updated CSV.

    Args:
        input_csv: Path to input CSV file
        output_csv: Path to output CSV file
        destination_dir: Base destination directory for copied files
        source_base_dir: Optional base directory for source files (if paths are relative)
    """
    # Create destination directory
    dest_path = Path(destination_dir)
    dest_path.mkdir(parents=True, exist_ok=True)

    # Track statistics
    stats = {
        'rows_processed': 0,
        'files_copied': 0,
        'files_missing': 0,
        'errors': []
    }

    # Read input CSV and process
    with open(input_csv, 'r', encoding='utf-8') as infile, \
         open(output_csv, 'w', encoding='utf-8', newline='') as outfile:

        reader = csv.DictReader(infile)

        # Verify 'filename' column exists
        if 'filename' not in reader.fieldnames:
            raise ValueError("CSV must contain a 'filename' column")

        writer = csv.DictWriter(outfile, fieldnames=reader.fieldnames)
        writer.writeheader()

        for row_num, row in enumerate(reader, start=1):
            stats['rows_processed'] += 1

            # Get filenames (may be multiple, separated by ' | ')
            filename_field = row['filename']
            filenames = [f.strip() for f in filename_field.split('||')]

            # Create row directory (e.g., row_001, row_002, etc.)
            row_dir = dest_path / f"row_{row_num:03d}"
            row_dir.mkdir(exist_ok=True)

            # Copy each file and collect new paths
            new_filenames = []

            for file_path in filenames:
                # Determine source path
                if source_base_dir:
                    source_file = Path(source_base_dir) / file_path
                else:
                    source_file = Path(file_path)

                # Get the base filename
                base_name = source_file.name

                # Destination path
                dest_file = row_dir / base_name

                # Copy file
                try:
                    if source_file.exists():
                        shutil.copy2(source_file, dest_file)
                        stats['files_copied'] += 1
                        print(f"Copied: {source_file} -> {dest_file}")
                    else:
                        stats['files_missing'] += 1
                        error_msg = f"Row {row_num}: File not found: {source_file}"
                        stats['errors'].append(error_msg)
                        print(f"WARNING: {error_msg}")
                        # Still add to new_filenames to preserve structure

                except Exception as e:
                    error_msg = f"Row {row_num}: Error copying {source_file}: {str(e)}"
                    stats['errors'].append(error_msg)
                    print(f"ERROR: {error_msg}")

                # Build new relative path
                new_path = f"{row_dir.name}/{base_name}"
                new_filenames.append(new_path)

            # Update the filename field with new paths
            row['filename'] = '||'.join(new_filenames)

            # Write updated row to output CSV
            writer.writerow(row)

    return stats


def main():
    """Main entry point for the script."""
    import argparse

    parser = argparse.ArgumentParser(
        description='Reorganize files from CSV into sequential directories'
    )
    parser.add_argument('-i', '--input_csv', help='Input CSV file')
    parser.add_argument('-d', '--destination_dir', help='Destination directory for copied files')
    parser.add_argument(
        '-o', '--output-csv',
        help='Output CSV file (default: <input>_updated.csv)'
    )
    parser.add_argument(
        '-s', '--source-base',
        help='Base directory for source files (if CSV paths are relative)'
    )

    args = parser.parse_args()

    # Set default output CSV name if not provided
    if not args.output_csv:
        input_path = Path(args.input_csv)
        args.output_csv = input_path.stem + '_updated.csv'

    print(f"Input CSV: {args.input_csv}")
    print(f"Output CSV: {args.output_csv}")
    print(f"Destination: {args.destination_dir}")
    if args.source_base:
        print(f"Source base: {args.source_base}")
    print("-" * 60)

    # Process files
    try:
        stats = copy_files_and_update_csv(
            args.input_csv,
            args.output_csv,
            args.destination_dir,
            args.source_base
        )

        # Print summary
        print("-" * 60)
        print("SUMMARY:")
        print(f"  Rows processed: {stats['rows_processed']}")
        print(f"  Files copied: {stats['files_copied']}")
        print(f"  Files missing: {stats['files_missing']}")
        print(f"  Errors: {len(stats['errors'])}")

        if stats['errors']:
            print("\nErrors encountered:")
            for error in stats['errors'][:10]:  # Show first 10 errors
                print(f"  - {error}")
            if len(stats['errors']) > 10:
                print(f"  ... and {len(stats['errors']) - 10} more errors")

        print(f"\nUpdated CSV written to: {args.output_csv}")

    except Exception as e:
        print(f"FATAL ERROR: {e}", file=sys.stderr)
        sys.exit(1)


if __name__ == '__main__':
    main()

This requires an input csv with a filename column with each file separated like ||. Because you might run this from anywhere, I also suggest that the CSV have the full static path to each file – not a relative path. Finally give it an output csv and a directory it can write to. Your full command will look like this:

python reorganize_csv_files.py -i mcinnis.csv -d /Volumes/digital_project_management/mark_playground/mcinnis_new -o mcinnis_new.csv