import os
import re
import yaml

# Configuration: locations of the LinkML schemas and of the ontology files.
# Both are relative paths, so they resolve against the working directory.
SCHEMA_DIR = "schemas/20251121/linkml"
ONTOLOGY_DIR = "data/ontology"

# CURIE prefix -> ontology file name inside ONTOLOGY_DIR.
# Derived from ONTOLOGY_CATALOG.md and a directory listing, then corrected
# against the files that actually exist.
PREFIX_TO_FILE = {
    "crm": "CIDOC_CRM_v7.1.3.rdf",
    "crmgeo": "CRMgeo_v1_2.rdfs",
    "rico": "RiC-O_1-1.rdf",
    "edm": "edm.owl",
    "bf": "bibframe.rdf",
    "premis": "premis3.owl",
    "pico": "pico.ttl",
    "foaf": "foaf.ttl",
    "org": "org.rdf",
    "cpov": "core-public-organisation-ap.ttl",
    "regorg": "regorg.ttl",
    "tooi": "tooiont.ttl",

    # gleif-* prefixes
    "gleif-base": "gleif_base.ttl",
    "gleif-L1": "gleif_l1.ttl",
    "gleif-L2": "gleif_l2.ttl",
    "gleif-elf": "gleif_legal_form.ttl",
    "gleif-ra": "gleif_ra.ttl",

    "ebg": "ebg-ontology.ttl",
    "fibo-fnd": "fibo.rdf",
    "gn": "geonames_ontology.rdf",
    "geo": "geo.ttl",
    "wgs84_pos": "wgs84_pos.rdf",

    # lcc-* prefixes
    "lcc-cr": "lcc-cr.rdf",
    "lcc-3166-1": "lcc-3166-1.rdf",
    "lcc-3166-2": "lcc-3166-2.rdf",
    "lcc-lr": "lcc-lr.rdf",
    "lcc-639-1": "lcc-639-1.rdf",

    "sosa": "sosa.ttl",
    "ssn": "ssn.ttl",
    "skos": "skos.rdf",
    "dc": "dublin_core_elements.rdf",
    # "dcterms" used to point at dublin_core_elements.rdf; it now has its
    # own file.
    "dcterms": "dcterms.rdf",
    "dcat": "dcat3.ttl",
    "schema": "schemaorg.owl",
    "vcard": "vcard.ttl",
    "dqv": "dqv.ttl",
    "adms": "adms.ttl",
    "prov": "prov-o.ttl",  # Prefer PROV-O
    "pav": "pav.rdf",
    "time": "time.ttl",
    "ore": "ore.rdf",
    "era": "era_ontology.ttl",
    "doap": "doap.rdf",
    "dbo": "dbpedia_ontology.owl",
    "hydra": "hydra_cg.jsonld",
    "oasis": "oasis.owl",
    "wod": "wod_thing.ttl",
}

# Cache of ontology file contents, keyed by file name.
ontology_content = dict()

def get_ontology_content(filename):
    """Return the text of an ontology file, reading it from disk at most once.

    The file is looked up under ``ONTOLOGY_DIR``. If it does not exist and its
    extension is ``.ttl`` or ``.rdf``, the same name with the other of those
    two extensions is tried instead. Every outcome, including a failed
    lookup, is stored in the module-level ``ontology_content`` cache under
    *filename*, so a missing ontology is probed on disk once rather than once
    per term checked.

    Args:
        filename: File name relative to ``ONTOLOGY_DIR``.

    Returns:
        The file's text decoded as UTF-8 (undecodable bytes are dropped), or
        ``None`` if no matching file exists or it cannot be read.
    """
    if filename in ontology_content:
        return ontology_content[filename]

    path = os.path.join(ONTOLOGY_DIR, filename)
    if not os.path.exists(path):
        # Fallback for commonly confused extensions. Only the extension is
        # swapped; str.replace would also rewrite a ".ttl"/".rdf" that
        # happens to occur earlier in the path.
        stem, ext = os.path.splitext(path)
        alternate_ext = {'.ttl': '.rdf', '.rdf': '.ttl'}.get(ext)
        if alternate_ext is None or not os.path.exists(stem + alternate_ext):
            ontology_content[filename] = None
            return None
        path = stem + alternate_ext

    try:
        with open(path, 'r', encoding='utf-8', errors='ignore') as f:
            content = f.read()
    except OSError:
        # Best effort: an unreadable file is treated like a missing one.
        content = None

    ontology_content[filename] = content
    return content

def check_term_in_ontology(prefix, term, filename):
    """Heuristically decide whether *term* is mentioned in an ontology file.

    This is a textual search, not an RDF parse. The term counts as present
    when the file contains any of:

    * ``ID="term"`` or ``about="term"`` (covers ``rdf:ID`` / ``rdf:about``),
    * an IRI ending in ``/term"`` or ``#term"``,
    * the bare token surrounded by single spaces (very loose, kept on
      purpose),
    * ``:term`` as a complete local name (covers ``prefix:term``),
    * for the ``schema`` prefix only, ``schema.org/term``.

    The last two forms have no closing delimiter of their own, so the
    character after the term must not be a letter, digit or underscore.
    Without that, ``Concept`` would be accepted because ``ConceptScheme``
    exists, and truncated or misspelled terms would pass verification.

    Args:
        prefix: CURIE prefix; only consulted for the ``schema`` special case.
        term: Local name to look for.
        filename: Ontology file name, passed to ``get_ontology_content``.

    Returns:
        ``True`` if one of the forms above is found, ``False`` otherwise or
        when the ontology file could not be loaded.
    """
    content = get_ontology_content(filename)
    if content is None:
        return False  # Ontology file missing

    # These forms are delimited on both sides, so a plain substring test
    # cannot match a longer name.
    delimited_forms = (
        f'ID="{term}"',
        f'about="{term}"',
        f'/{term}"',
        f'#{term}"',
        f' {term} ',  # Very loose
    )
    if any(form in content for form in delimited_forms):
        return True

    # Open-ended forms: require that the name actually ends after the term.
    if prefix == 'schema':
        # schema.org uses http://schema.org/Term
        lead = r'(?::|schema\.org/)'
    else:
        lead = ':'
    pattern = lead + re.escape(term) + r'(?![A-Za-z0-9_])'
    return re.search(pattern, content) is not None

def verify_files():
    """Report schema CURIEs whose term cannot be found in its ontology file.

    Walks ``SCHEMA_DIR`` (skipping ``archive`` directories), scans every
    ``.yaml`` file for ``prefix:Term`` tokens and, for each token whose prefix
    is a key of ``PREFIX_TO_FILE``, asks ``check_term_in_ontology`` whether
    the term occurs in the mapped ontology file. One line is printed to
    stdout for every occurrence that is not found. Files that cannot be read
    or decoded as UTF-8 are skipped with a note on stderr.
    """
    import sys  # local import: only needed for the skip notice below

    # The prefix class admits upper-case letters and underscores so that
    # mapped prefixes such as "gleif-L1" and "wgs84_pos" can match at all.
    # NOTE: the lookbehinds only reject a prefix directly preceded by
    # "http:" / "https:"; "http://..." is already excluded because "/"
    # cannot start a term.
    curie_pattern = re.compile(
        r'(?<!http:)(?<!https:)\b([A-Za-z0-9_-]+):([a-zA-Z0-9_]+)\b'
    )

    # (prefix, term) -> lookup result. The ontology file is fixed per prefix,
    # so each distinct CURIE needs to be searched for only once per run.
    verdicts = {}

    # Walk through schema directory
    for root, dirs, files in os.walk(SCHEMA_DIR):
        # Exclude archive directory (pruning dirs stops os.walk descending)
        if "archive" in dirs:
            dirs.remove("archive")

        # Also skip if current root is inside an archive directory
        if "/archive" in root or os.path.sep + "archive" in root:
            continue

        for file in files:
            if not file.endswith(".yaml"):
                continue

            filepath = os.path.join(root, file)
            try:
                with open(filepath, 'r', encoding='utf-8') as f:
                    content = f.read()
            except (OSError, UnicodeDecodeError) as exc:
                # Best effort: keep going, but say which file was skipped.
                print(f"Skipping unreadable file {filepath}: {exc}", file=sys.stderr)
                continue

            # Find all CURIEs: prefix:Term
            for prefix, term in curie_pattern.findall(content):
                filename = PREFIX_TO_FILE.get(prefix)
                if filename is None:
                    continue

                key = (prefix, term)
                if key not in verdicts:
                    verdicts[key] = check_term_in_ontology(prefix, term, filename)
                if not verdicts[key]:
                    print(f"File: {filepath} | Message: Term not found in {filename} ontology file. | Term: {prefix}:{term}")

# Run the verification only when executed as a script, not on import.
if __name__ == "__main__":
    verify_files()