Migrating from a directory of JSON files¶
This is how to take a directory of IIIF manifests and extract metadata and derivatives. This method works with both v2 and v3 IIIF manifests.
Use this derivative and metadata extraction script¶
import json
import csv
import os
from collections import defaultdict
# IIIF Image API path suffix that identifies a full-size JPEG derivative URL.
SUFFIX = "/full/full/0/default.jpg"
# Canonical CSV fields
# (order here determines column order in the output CSV)
METADATA_FIELDS = [
"dc.source",
"creator",
"contributor",
"subject",
"rights",
"format",
"language",
"title",
"type",
"extent",
"created",
"abstract",
"spatial",
"Description",
"Created On",
"In Scope",
]
# Normalized lookup map (lowercase → canonical)
# Keys are compared after .lower().strip(), so manifest labels may vary in
# case/whitespace and still map onto a canonical METADATA_FIELDS entry.
FIELD_LOOKUP = {
"dc.source": "dc.source",
"creator": "creator",
"contributor": "contributor",
"subject": "subject",
"rights": "rights",
"format": "format",
"language": "language",
"title": "title",
"type": "type",
"extent": "extent",
"created": "created",
"date": "created", # common IIIF variant
"abstract": "abstract",
"description": "Description",
"spatial": "spatial",
"created on": "Created On",
"in scope": "In Scope",
}
def find_matching_strings(data, matches=None):
    """Recursively collect every string in *data* that ends with SUFFIX.

    Walks arbitrarily nested dicts and lists (a parsed IIIF manifest) and
    returns the accumulated list of matching derivative URLs in traversal
    order. *matches* is an internal accumulator; callers normally omit it.
    """
    if matches is None:
        matches = []
    if isinstance(data, str):
        # Leaf string: keep it only if it looks like a full-size JPEG URL.
        if data.endswith(SUFFIX):
            matches.append(data)
    elif isinstance(data, dict):
        for child in data.values():
            find_matching_strings(child, matches)
    elif isinstance(data, list):
        for child in data:
            find_matching_strings(child, matches)
    return matches
def extract_langmap_text(obj):
    """
    Flatten a IIIF language map into a list of stripped strings.

    Handles shapes like { "none": ["text"] } or { "en": ["text"] }; scalar
    values are accepted too. Falsy values (empty strings, None) are dropped.
    Anything that is not a dict yields an empty list.
    """
    if not isinstance(obj, dict):
        return []
    flat = []
    for entry in obj.values():
        # Wrap scalars so both list and scalar values flatten uniformly.
        flat.extend(entry if isinstance(entry, list) else [entry])
    return [str(item).strip() for item in flat if item]
def normalize_label(label_raw):
    """
    Map a raw IIIF metadata label to its canonical METADATA_FIELDS name.

    *label_raw* may be a plain string (v2 manifests) or a language map
    (v3 manifests). Returns the canonical field name, or None when the
    label is unrecognized or of an unexpected type.
    """
    if isinstance(label_raw, dict):
        # v3 language map: use the first extracted text, if any.
        texts = extract_langmap_text(label_raw)
        label = texts[0] if texts else ""
    elif isinstance(label_raw, str):
        label = label_raw
    else:
        return None
    return FIELD_LOOKUP.get(label.strip().lower())
def normalize_metadata(metadata):
    """
    Group IIIF metadata entries into { canonical_label: [value1, ...] }.

    Entries whose label does not normalize to a canonical field are
    skipped, as is anything that is not a dict. Non-list input yields an
    empty mapping. Always returns a defaultdict(list).
    """
    grouped = defaultdict(list)
    if not isinstance(metadata, list):
        return grouped
    for entry in metadata:
        if not isinstance(entry, dict):
            continue
        field = normalize_label(entry.get("label"))
        if not field:
            continue
        raw = entry.get("value")
        # Accept the three shapes IIIF manifests use: list, language map,
        # or scalar. Missing values contribute nothing.
        if isinstance(raw, list):
            grouped[field].extend(str(item).strip() for item in raw)
        elif isinstance(raw, dict):
            grouped[field].extend(extract_langmap_text(raw))
        elif raw is not None:
            grouped[field].append(str(raw).strip())
    return grouped
def process_directory(json_dir, output_csv):
    """
    Extract metadata and derivative URLs from every ``*.json`` manifest in
    *json_dir* and write one CSV row per manifest to *output_csv*.

    Columns: ``filename``, ``label``, then one column per repeated metadata
    value (``field_1``, ``field_2``, ...) sized to the widest manifest, and
    finally a ``derivatives`` column with matching image URLs joined by ";".
    Unreadable or malformed files are skipped with a warning.
    """
    rows = []
    # Widest number of values seen per field, so the header can allot columns.
    max_counts = defaultdict(int)
    for filename in sorted(os.listdir(json_dir)):
        if not filename.lower().endswith(".json"):
            continue
        json_path = os.path.join(json_dir, filename)
        try:
            with open(json_path, "r", encoding="utf-8") as f:
                data = json.load(f)
        except (OSError, ValueError) as exc:
            # Best-effort: skip unreadable/invalid manifests, but say so
            # instead of silently dropping them (json.JSONDecodeError is a
            # ValueError subclass).
            print(f"Warning: skipping {filename}: {exc}")
            continue
        if not isinstance(data, dict):
            # A top-level array/scalar is not a manifest; previously this
            # crashed on data.get().
            print(f"Warning: skipping {filename}: top-level JSON is not an object")
            continue
        # Manifest label may be a plain string (v2) or a language map (v3).
        label_raw = data.get("label", "")
        label_value = ""
        if isinstance(label_raw, str):
            label_value = label_raw.strip()
        elif isinstance(label_raw, dict):
            texts = extract_langmap_text(label_raw)
            label_value = texts[0] if texts else ""
        row = {
            "filename": filename,
            "label": label_value
        }
        metadata = normalize_metadata(data.get("metadata", []))
        for field in METADATA_FIELDS:
            for i, value in enumerate(metadata.get(field, []), start=1):
                # Repeated values get numbered columns: field_1, field_2, ...
                row[f"{field}_{i}"] = value
                max_counts[field] = max(max_counts[field], i)
        row["derivatives"] = ";".join(find_matching_strings(data))
        rows.append(row)
    # Build CSV header: fixed columns, then field_1..field_N per metadata field.
    header = ["filename", "label"]
    for field in METADATA_FIELDS:
        header.extend(f"{field}_{i}" for i in range(1, max_counts[field] + 1))
    header.append("derivatives")
    with open(output_csv, "w", newline="", encoding="utf-8") as csvfile:
        writer = csv.writer(csvfile, quoting=csv.QUOTE_ALL)
        writer.writerow(header)
        for row in rows:
            # Rows missing a column get an empty cell.
            writer.writerow([row.get(col, "") for col in header])
if __name__ == "__main__":
    # Edit these placeholders: the directory containing the IIIF manifest
    # JSON files, and the path of the CSV file to write.
    process_directory("path/to/jsons", "output.csv")
Intermediate steps¶
Open the output CSV in Google Sheets. You will then need to rearrange it to match the spreadsheet format described here.
Read this page if you wish to add linked data before uploading to Archipelago.