#!/usr/bin/env python3
"""
Add claim-level provenance to enriched NDE YAML files.

This script adds proper provenance tracking to all enriched files,
recording where each piece of data came from.

Provenance Schema:
- Each claim (data point) can have its own source reference
- Sources include: original_entry, wikidata_api, google_maps_api,
  exa_web_search, webfetch
- For web content: URL, fetch_timestamp, text_excerpt (exact quoted text)
- For Exa: highlights with scores, markdown section path
"""

import os
import sys
import yaml
from pathlib import Path
from datetime import datetime, timezone
from typing import Any, Dict, List, Optional
import re


# Preserve YAML formatting: multi-line strings are emitted as literal
# block scalars (style='|') so embedded newlines survive a round trip.
class PreservingDumper(yaml.SafeDumper):
    pass


def str_representer(dumper, data):
    """Represent multi-line strings as literal block scalars."""
    if '\n' in data:
        return dumper.represent_scalar('tag:yaml.org,2002:str', data, style='|')
    return dumper.represent_scalar('tag:yaml.org,2002:str', data)


PreservingDumper.add_representer(str, str_representer)


def create_source_reference(
    source_type: str,
    url: Optional[str] = None,
    fetch_timestamp: Optional[str] = None,
    text_excerpt: Optional[str] = None,
    markdown_section: Optional[str] = None,
    exa_highlight_score: Optional[float] = None,
    api_endpoint: Optional[str] = None,
    entity_id: Optional[str] = None,
    place_id: Optional[str] = None,
    claims_extracted: Optional[List[str]] = None,
) -> Dict[str, Any]:
    """Create a standardized source reference object.

    Only fields with truthy values are included, keeping the emitted
    YAML compact.  Key insertion order is deliberate — it controls the
    order keys appear in the serialized YAML (sort_keys=False).
    """
    ref: Dict[str, Any] = {"source_type": source_type}
    for key, value in (
        ("url", url),
        ("fetch_timestamp", fetch_timestamp),
        ("text_excerpt", text_excerpt),
        ("markdown_section", markdown_section),
    ):
        if value:
            ref[key] = value
    # A highlight score may legitimately be 0.0, so compare against None
    # instead of relying on truthiness.
    if exa_highlight_score is not None:
        ref["exa_highlight_score"] = exa_highlight_score
    for key, value in (
        ("api_endpoint", api_endpoint),
        ("entity_id", entity_id),
        ("place_id", place_id),
        ("claims_extracted", claims_extracted),
    ):
        if value:
            ref[key] = value
    return ref


def extract_provenance_from_existing(data: Dict[str, Any]) -> Dict[str, List[Dict]]:
    """
    Extract provenance information from existing enrichment data
    and organize it by claim type.

    Returns a mapping of source name ("original_entry", "wikidata",
    "google_maps", "exa", "website") to a list of source-reference dicts.
    Sections absent from ``data`` are simply omitted.
    """
    provenance: Dict[str, List[Dict]] = {}

    # 1. Original NDE Entry provenance
    if "original_entry" in data:
        provenance["original_entry"] = [{
            "source_type": "nde_csv_registry",
            "data_tier": "TIER_1_AUTHORITATIVE",
            "claims_extracted": list(data["original_entry"].keys()),
        }]

    # 2. Wikidata enrichment provenance
    if "wikidata_enrichment" in data:
        wd = data["wikidata_enrichment"]
        api_meta = wd.get("api_metadata", {})
        wd_ref = create_source_reference(
            source_type="wikidata_api",
            api_endpoint=api_meta.get(
                "api_endpoint",
                "https://www.wikidata.org/w/rest.php/wikibase/v1"),
            fetch_timestamp=api_meta.get("fetch_timestamp"),
            entity_id=wd.get("wikidata_entity_id"),
            claims_extracted=[]
        )
        # Track which claims came from Wikidata.  Each present
        # "wikidata_<name>" field contributes the claim "<name>".
        wikidata_claims = [
            field[len("wikidata_"):]
            for field in (
                "wikidata_labels", "wikidata_descriptions",
                "wikidata_instance_of", "wikidata_country",
                "wikidata_located_in", "wikidata_coordinates",
                "wikidata_official_website",
            )
            if field in wd
        ]
        if "wikidata_claims" in wd:
            wikidata_claims.extend(
                f"claim_{claim_key}" for claim_key in wd["wikidata_claims"])
        wd_ref["claims_extracted"] = wikidata_claims
        provenance["wikidata"] = [wd_ref]

    # 3. Google Maps enrichment provenance
    if "google_maps_enrichment" in data:
        gm = data["google_maps_enrichment"]
        gm_ref = create_source_reference(
            source_type="google_maps_api",
            api_endpoint="https://maps.googleapis.com/maps/api/place/textsearch",
            fetch_timestamp=gm.get("fetch_timestamp"),
            place_id=gm.get("place_id"),
            claims_extracted=[]
        )
        # Track which claims came from Google Maps.  "phone" is special:
        # it is claimed when either of two phone fields is present.
        gm_claims = [
            key for key in (
                "coordinates", "formatted_address", "address_components",
                "business_status", "website",
            )
            if key in gm
        ]
        if "phone_local" in gm or "phone_international" in gm:
            gm_claims.append("phone")
        gm_claims.extend(
            key for key in ("rating", "reviews", "opening_hours") if key in gm)
        gm_ref["claims_extracted"] = gm_claims
        provenance["google_maps"] = [gm_ref]

    # 4. Exa enrichment provenance (if exists)
    if "exa_enrichment" in data:
        exa = data["exa_enrichment"]
        exa_ref = create_source_reference(
            source_type="exa_web_search",
            url=exa.get("source_url"),
            fetch_timestamp=exa.get("fetch_timestamp"),
            claims_extracted=[]
        )
        # If we have highlights, include them verbatim.
        if "highlights" in exa:
            exa_ref["highlights"] = exa["highlights"]
        provenance["exa"] = [exa_ref]

    # 5. Website enrichment provenance (direct webfetch)
    if "website_enrichment" in data or "organization_details" in data:
        # Reuse source_references already present in organization_details.
        org = data.get("organization_details", {})
        if "source_references" in org:
            provenance["website"] = org["source_references"]

    return provenance


def add_provenance_section(data: Dict[str, Any]) -> Dict[str, Any]:
    """
    Add a consolidated provenance section to the YAML data.

    Returns a new dict with the same keys in the same order, plus a
    "provenance" key inserted immediately after "enrichment_status"
    (or appended at the end when that key is absent).
    """
    existing_prov = extract_provenance_from_existing(data)

    provenance = {
        "schema_version": "1.0.0",
        "generated_at": datetime.now(timezone.utc).isoformat(),
        "sources": existing_prov,
        "data_tier_summary": {
            "TIER_1_AUTHORITATIVE": ["original_entry (NDE CSV)"],
            "TIER_2_VERIFIED": ["wikidata_api", "google_maps_api"],
            "TIER_3_CROWD_SOURCED": [],
            "TIER_4_INFERRED": ["website_scrape", "exa_web_search"],
        },
        "notes": [
            "Provenance tracking added retroactively",
            "claim_level_provenance available in sources section",
        ],
    }

    # Insert the provenance section right after enrichment_status so it
    # appears near the top of the serialized file.
    new_data = {}
    for key, value in data.items():
        new_data[key] = value
        if key == "enrichment_status":
            new_data["provenance"] = provenance

    # If enrichment_status wasn't found, add at end.
    if "provenance" not in new_data:
        new_data["provenance"] = provenance

    return new_data


def has_provenance(data: Dict[str, Any]) -> bool:
    """Check if file already has a versioned provenance section."""
    return "provenance" in data and "schema_version" in data.get("provenance", {})


def process_file(filepath: Path, dry_run: bool = False) -> bool:
    """
    Process a single YAML file and add provenance if needed.

    Returns True if the file was updated (or would be, under dry_run);
    False when skipped or on error.  Errors are printed, not raised,
    so a bad file does not abort the batch.
    """
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            data = yaml.safe_load(f)

        if data is None:
            print(f"  Skipping empty file: {filepath.name}")
            return False

        if has_provenance(data):
            print(f"  Already has provenance: {filepath.name}")
            return False

        # Check if it's an enriched file (has any enrichment data)
        if not any(key in data for key in [
                "wikidata_enrichment", "google_maps_enrichment",
                "organization_details", "website_enrichment",
                "exa_enrichment"]):
            print(f"  Not enriched, skipping: {filepath.name}")
            return False

        # Add provenance
        new_data = add_provenance_section(data)

        if dry_run:
            print(f"  Would update: {filepath.name}")
            return True

        # Write updated file
        with open(filepath, 'w', encoding='utf-8') as f:
            yaml.dump(new_data, f, Dumper=PreservingDumper,
                      allow_unicode=True, default_flow_style=False,
                      sort_keys=False)

        print(f"  Updated: {filepath.name}")
        return True

    except Exception as e:
        # Deliberately broad: one malformed file must not stop the batch.
        print(f"  Error processing {filepath.name}: {e}")
        return False


def main():
    import argparse
    parser = argparse.ArgumentParser(
        description="Add provenance to enriched NDE YAML files")
    parser.add_argument("--dry-run", action="store_true",
                        help="Don't actually modify files")
    parser.add_argument("--limit", type=int, default=None,
                        help="Limit number of files to process")
    parser.add_argument("--pattern", type=str, default="*_Q*.yaml",
                        help="File pattern to match")
    parser.add_argument("--dir", type=str,
                        default="/Users/kempersc/apps/glam/data/nde/enriched/entries",
                        help="Directory containing enriched entry files")
    args = parser.parse_args()

    entries_dir = Path(args.dir)
    if not entries_dir.exists():
        print(f"Error: Directory not found: {entries_dir}")
        sys.exit(1)

    # Find all enriched files
    files = sorted(entries_dir.glob(args.pattern))
    # "is not None" so an explicit --limit 0 is honored (processes nothing).
    if args.limit is not None:
        files = files[:args.limit]

    print(f"Found {len(files)} files matching pattern '{args.pattern}'")
    print(f"Dry run: {args.dry_run}")
    print()

    updated = 0
    skipped = 0

    for filepath in files:
        if process_file(filepath, dry_run=args.dry_run):
            updated += 1
        else:
            skipped += 1

    print()
    print("Summary:")
    print(f"  Updated: {updated}")
    print(f"  Skipped: {skipped}")
    print(f"  Total: {len(files)}")


if __name__ == "__main__":
    main()