Migrating from a directory of JSON files¶
This is how to take a directory of IIIF manifests and extract metadata and derivatives. This method works with both v2 and v3 IIIF manifests.
Use this derivative and metadata extraction script¶
import json
import csv
import os
from collections import defaultdict
# IIIF Image API path suffix that identifies a full-size JPEG derivative URL.
SUFFIX = "/full/full/0/default.jpg"
# Canonical CSV fields
# (order here determines column order in the output CSV)
METADATA_FIELDS = [
"dc.source",
"creator",
"contributor",
"subject",
"rights",
"format",
"language",
"title",
"type",
"extent",
"created",
"abstract",
"spatial",
"Description",
"Created On",
"In Scope",
]
# Normalized lookup map (lowercase → canonical)
# Keys are compared after .lower().strip(), so manifest labels may vary in
# case/whitespace and still map onto a canonical METADATA_FIELDS entry.
FIELD_LOOKUP = {
"dc.source": "dc.source",
"creator": "creator",
"contributor": "contributor",
"subject": "subject",
"rights": "rights",
"format": "format",
"language": "language",
"title": "title",
"type": "type",
"extent": "extent",
"created": "created",
"date": "created", # common IIIF variant
"abstract": "abstract",
"description": "Description",
"spatial": "spatial",
"created on": "Created On",
"in scope": "In Scope",
}
def find_matching_strings(data, matches=None):
    """Recursively collect every string in *data* that ends with SUFFIX.

    Walks arbitrarily nested dicts and lists (a parsed IIIF manifest) and
    returns the accumulated list of matching derivative URLs in traversal
    order. *matches* is an internal accumulator; callers normally omit it.
    """
    if matches is None:
        matches = []
    if isinstance(data, str):
        # Leaf string: keep it only if it looks like a full-size JPEG URL.
        if data.endswith(SUFFIX):
            matches.append(data)
    elif isinstance(data, dict):
        for child in data.values():
            find_matching_strings(child, matches)
    elif isinstance(data, list):
        for child in data:
            find_matching_strings(child, matches)
    return matches
def extract_langmap_text(obj):
    """
    Flatten a IIIF language map into a list of stripped strings.

    Handles shapes like { "none": ["text"] } or { "en": ["text"] }; scalar
    values are accepted too. Falsy values (empty strings, None) are dropped.
    Anything that is not a dict yields an empty list.
    """
    if not isinstance(obj, dict):
        return []
    flat = []
    for entry in obj.values():
        # Wrap scalars so both list and scalar values flatten uniformly.
        flat.extend(entry if isinstance(entry, list) else [entry])
    return [str(item).strip() for item in flat if item]
def normalize_label(label_raw):
    """
    Map a raw IIIF metadata label to its canonical METADATA_FIELDS name.

    *label_raw* may be a plain string (v2 manifests) or a language map
    (v3 manifests). Returns the canonical field name, or None when the
    label is unrecognized or of an unexpected type.
    """
    if isinstance(label_raw, dict):
        # v3 language map: use the first extracted text, if any.
        texts = extract_langmap_text(label_raw)
        label = texts[0] if texts else ""
    elif isinstance(label_raw, str):
        label = label_raw
    else:
        return None
    return FIELD_LOOKUP.get(label.strip().lower())
def normalize_metadata(metadata):
    """
    Group IIIF metadata entries into { canonical_label: [value1, ...] }.

    Entries whose label does not normalize to a canonical field are
    skipped, as is anything that is not a dict. Non-list input yields an
    empty mapping. Always returns a defaultdict(list).
    """
    grouped = defaultdict(list)
    if not isinstance(metadata, list):
        return grouped
    for entry in metadata:
        if not isinstance(entry, dict):
            continue
        field = normalize_label(entry.get("label"))
        if not field:
            continue
        raw = entry.get("value")
        # Accept the three shapes IIIF manifests use: list, language map,
        # or scalar. Missing values contribute nothing.
        if isinstance(raw, list):
            grouped[field].extend(str(item).strip() for item in raw)
        elif isinstance(raw, dict):
            grouped[field].extend(extract_langmap_text(raw))
        elif raw is not None:
            grouped[field].append(str(raw).strip())
    return grouped
def process_directory(json_dir, output_csv):
    """
    Extract metadata and derivative URLs from every ``*.json`` manifest in
    *json_dir* and write one CSV row per manifest to *output_csv*.

    Columns: ``filename``, ``label``, then one column per repeated metadata
    value (``field_1``, ``field_2``, ...) sized to the widest manifest, and
    finally a ``derivatives`` column with matching image URLs joined by ";".
    Unreadable or malformed files are skipped with a warning.
    """
    rows = []
    # Widest number of values seen per field, so the header can allot columns.
    max_counts = defaultdict(int)
    for filename in sorted(os.listdir(json_dir)):
        if not filename.lower().endswith(".json"):
            continue
        json_path = os.path.join(json_dir, filename)
        try:
            with open(json_path, "r", encoding="utf-8") as f:
                data = json.load(f)
        except (OSError, ValueError) as exc:
            # Best-effort: skip unreadable/invalid manifests, but say so
            # instead of silently dropping them (json.JSONDecodeError is a
            # ValueError subclass).
            print(f"Warning: skipping {filename}: {exc}")
            continue
        if not isinstance(data, dict):
            # A top-level array/scalar is not a manifest; previously this
            # crashed on data.get().
            print(f"Warning: skipping {filename}: top-level JSON is not an object")
            continue
        # Manifest label may be a plain string (v2) or a language map (v3).
        label_raw = data.get("label", "")
        label_value = ""
        if isinstance(label_raw, str):
            label_value = label_raw.strip()
        elif isinstance(label_raw, dict):
            texts = extract_langmap_text(label_raw)
            label_value = texts[0] if texts else ""
        row = {
            "filename": filename,
            "label": label_value
        }
        metadata = normalize_metadata(data.get("metadata", []))
        for field in METADATA_FIELDS:
            for i, value in enumerate(metadata.get(field, []), start=1):
                # Repeated values get numbered columns: field_1, field_2, ...
                row[f"{field}_{i}"] = value
                max_counts[field] = max(max_counts[field], i)
        row["derivatives"] = ";".join(find_matching_strings(data))
        rows.append(row)
    # Build CSV header: fixed columns, then field_1..field_N per metadata field.
    header = ["filename", "label"]
    for field in METADATA_FIELDS:
        header.extend(f"{field}_{i}" for i in range(1, max_counts[field] + 1))
    header.append("derivatives")
    with open(output_csv, "w", newline="", encoding="utf-8") as csvfile:
        writer = csv.writer(csvfile, quoting=csv.QUOTE_ALL)
        writer.writerow(header)
        for row in rows:
            # Rows missing a column get an empty cell.
            writer.writerow([row.get(col, "") for col in header])
if __name__ == "__main__":
    # Edit these placeholders: the directory containing the IIIF manifest
    # JSON files, and the path of the CSV file to write.
    process_directory("path/to/jsons", "output.csv")
Intermediate steps¶
Open the output CSV in Google Sheets. You will then need to rearrange it to match the spreadsheet format described here.
Read this page if you wish to add linked data before uploading to Archipelago.