========================================================
Renaming Files from Preservation for Technology Services
========================================================

Occasionally, we will get files from Special Collections or Preservation with very long filenames. Even though we bundle these with SAFCreator before we send them to Technology Services to put online with MAGPIE, the filenames may be long enough that Technology Services reports it cannot open the path. This is most likely Windows' default 260-character ``MAX_PATH`` limit on full path lengths.
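If you want to confirm that path length is actually the problem, a quick check like the one below reports the longest full path referenced in your CSV. This is a minimal sketch, assuming the same ``filename`` column and :code:`||` separator that the script below expects; ``mcinnis.csv`` is just the example file from the command at the end of this section.

.. code-block:: python

    import csv

    # Collect every path in the 'filename' column ('||' separates
    # multiple files in one row) and report the longest one, so it
    # can be compared against Windows' 260-character MAX_PATH default.
    with open('mcinnis.csv', newline='', encoding='utf-8') as f:
        paths = [
            p.strip()
            for row in csv.DictReader(f)
            for p in row['filename'].split('||')
        ]

    longest = max(paths, key=len)
    print(f"{len(longest)} characters: {longest}")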
To fix this, before you write your SAF, you can copy your files to a flatter structure using something like this:

.. code-block:: python

    import argparse
    import csv
    import shutil
    import sys
    from pathlib import Path


    def copy_files_and_update_csv(input_csv, output_csv, destination_dir, source_base_dir=None):
        """
        Copy files from CSV to destination and create updated CSV.

        Args:
            input_csv: Path to input CSV file
            output_csv: Path to output CSV file
            destination_dir: Base destination directory for copied files
            source_base_dir: Optional base directory for source files (if paths are relative)
        """
        # Create destination directory
        dest_path = Path(destination_dir)
        dest_path.mkdir(parents=True, exist_ok=True)

        # Track statistics
        stats = {
            'rows_processed': 0,
            'files_copied': 0,
            'files_missing': 0,
            'errors': []
        }

        # Read input CSV and process
        with open(input_csv, 'r', encoding='utf-8') as infile, \
             open(output_csv, 'w', encoding='utf-8', newline='') as outfile:

            reader = csv.DictReader(infile)

            # Verify 'filename' column exists
            if 'filename' not in reader.fieldnames:
                raise ValueError("CSV must contain a 'filename' column")

            writer = csv.DictWriter(outfile, fieldnames=reader.fieldnames)
            writer.writeheader()

            for row_num, row in enumerate(reader, start=1):
                stats['rows_processed'] += 1

                # Get filenames (may be multiple, separated by '||')
                filename_field = row['filename']
                filenames = [f.strip() for f in filename_field.split('||')]

                # Create row directory (e.g., row_001, row_002, etc.)
                row_dir = dest_path / f"row_{row_num:03d}"
                row_dir.mkdir(exist_ok=True)

                # Copy each file and collect new paths
                new_filenames = []
                for file_path in filenames:
                    # Determine source path
                    if source_base_dir:
                        source_file = Path(source_base_dir) / file_path
                    else:
                        source_file = Path(file_path)

                    # Get the base filename
                    base_name = source_file.name

                    # Destination path
                    dest_file = row_dir / base_name

                    # Copy file
                    try:
                        if source_file.exists():
                            shutil.copy2(source_file, dest_file)
                            stats['files_copied'] += 1
                            print(f"Copied: {source_file} -> {dest_file}")
                        else:
                            stats['files_missing'] += 1
                            error_msg = f"Row {row_num}: File not found: {source_file}"
                            stats['errors'].append(error_msg)
                            print(f"WARNING: {error_msg}")
                            # Still add to new_filenames to preserve structure
                    except Exception as e:
                        error_msg = f"Row {row_num}: Error copying {source_file}: {str(e)}"
                        stats['errors'].append(error_msg)
                        print(f"ERROR: {error_msg}")

                    # Build new relative path
                    new_path = f"{row_dir.name}/{base_name}"
                    new_filenames.append(new_path)

                # Update the filename field with new paths
                row['filename'] = '||'.join(new_filenames)

                # Write updated row to output CSV
                writer.writerow(row)

        return stats


    def main():
        """Main entry point for the script."""
        parser = argparse.ArgumentParser(
            description='Reorganize files from CSV into sequential directories'
        )
        parser.add_argument('-i', '--input-csv', required=True, help='Input CSV file')
        parser.add_argument('-d', '--destination-dir', required=True,
                            help='Destination directory for copied files')
        parser.add_argument(
            '-o', '--output-csv',
            help='Output CSV file (default: <input>_updated.csv)'
        )
        parser.add_argument(
            '-s', '--source-base',
            help='Base directory for source files (if CSV paths are relative)'
        )

        args = parser.parse_args()

        # Set default output CSV name if not provided
        if not args.output_csv:
            input_path = Path(args.input_csv)
            args.output_csv = input_path.stem + '_updated.csv'

        print(f"Input CSV: {args.input_csv}")
        print(f"Output CSV: {args.output_csv}")
        print(f"Destination: {args.destination_dir}")
        if args.source_base:
            print(f"Source base: {args.source_base}")
        print("-" * 60)

        # Process files
        try:
            stats = copy_files_and_update_csv(
                args.input_csv,
                args.output_csv,
                args.destination_dir,
                args.source_base
            )

            # Print summary
            print("-" * 60)
            print("SUMMARY:")
            print(f"  Rows processed: {stats['rows_processed']}")
            print(f"  Files copied: {stats['files_copied']}")
            print(f"  Files missing: {stats['files_missing']}")
            print(f"  Errors: {len(stats['errors'])}")

            if stats['errors']:
                print("\nErrors encountered:")
                for error in stats['errors'][:10]:  # Show first 10 errors
                    print(f"  - {error}")
                if len(stats['errors']) > 10:
                    print(f"  ... and {len(stats['errors']) - 10} more errors")

            print(f"\nUpdated CSV written to: {args.output_csv}")

        except Exception as e:
            print(f"FATAL ERROR: {e}", file=sys.stderr)
            sys.exit(1)


    if __name__ == '__main__':
        main()

This requires an input CSV with a ``filename`` column; when a row has more than one file, separate the paths with :code:`||`. Because you might run the script from anywhere, I also suggest that the CSV have the full absolute path to each file -- not a relative path (there is an example CSV at the end of this section). Finally, give it an output CSV and a directory it can write to. Your full command will look like this:

.. code-block:: shell

    python reorganize_csv_files.py -i mcinnis.csv -d /Volumes/digital_project_management/mark_playground/mcinnis_new -o mcinnis_new.csv
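For reference, a minimal input CSV in the expected shape might look like the sketch below. The paths and the ``title`` column are hypothetical; any columns other than ``filename`` are simply copied through to the updated CSV unchanged.

.. code-block:: text

    filename,title
    /Volumes/preservation/example_collection/a_very_long_descriptive_filename_0001.tif,Item One
    /Volumes/preservation/example_collection/page_0001.tif||/Volumes/preservation/example_collection/page_0002.tif,Item Two

After the script runs, the ``filename`` column in the output CSV points at the new ``row_001``, ``row_002``, etc. directories instead of the original long paths.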