glam/parse_eu_isil.py
2025-11-19 23:25:22 +01:00

142 lines
4.8 KiB
Python

#!/usr/bin/env python3
"""Parse European Union ISIL directory to LinkML format."""
import re
import yaml
from datetime import datetime, timezone
# Substrings that mark a line as carrying the organization name.
_ORG_NAME_KEYWORDS = ('European', 'Court', 'Committee')
# Substrings that mark a line as describing a sub-unit of the organization.
_SUB_UNIT_KEYWORDS = ('Library', 'Archives', 'Historical')
# Substrings treated as evidence that a line is a street address.
_ADDRESS_KEYWORDS = ('Rue', 'rue', 'Building', 'Via')

# Compiled once: these patterns are applied to every entry and every line.
_ISIL_CODE_RE = re.compile(r'EUR-[A-Z0-9]+')
_ENTRY_RE = re.compile(r'EUR-[A-Z0-9]+\s+.*?(?=EUR-[A-Z0-9]+|$)', re.DOTALL)
_DATE_RE = re.compile(r'\d{2}-[A-Za-z]{3}-\d{2,4}')
_WHITESPACE_RE = re.compile(r'\s+')


def _contains_any(line, keywords):
    """Return True when *line* contains at least one of *keywords*."""
    return any(keyword in line for keyword in keywords)


def _clean_org_name(line):
    """Remove ISIL codes and ``DD-Mon-YY(YY)`` dates from *line*, collapsing whitespace."""
    name = _ISIL_CODE_RE.sub('', line).strip()
    name = _DATE_RE.sub('', name).strip()
    return _WHITESPACE_RE.sub(' ', name).strip()


def _extract_fields(lines):
    """Scan the lines of one entry and return the fields found by keyword heuristics.

    Returns a dict with the keys ``org_name``, ``sub_unit``, ``country``,
    ``city`` and ``address``; a field that no line matched is ``None``.
    """
    org_name = sub_unit = country = city = address = None

    for line in lines:
        if _contains_any(line, _ORG_NAME_KEYWORDS):
            # The most recent matching line wins: a later line that also
            # mentions one of the keywords replaces an earlier name.
            org_name = _clean_org_name(line)
            if not org_name or org_name == '/':
                # Nothing usable left after cleaning; skip the other checks
                # for this line.
                continue

        if _contains_any(line, _SUB_UNIT_KEYWORDS):
            # Accept the line as a sub-unit only while no name is known yet
            # or when the line also contains the current name.
            if not org_name or org_name in line:
                sub_unit = line.strip()

        # Only three countries are recognised; the first one mentioned on a
        # line decides that line.
        if 'Belgium' in line:
            country = 'BE'
            if 'Brussels' in line:
                city = 'Brussels'
        elif 'Luxembourg' in line:
            country = 'LU'
            city = 'Luxembourg'
        elif 'Italy' in line:
            country = 'IT'
            if 'Florence' in line:
                city = 'Florence'

        if _contains_any(line, _ADDRESS_KEYWORDS):
            address = line.strip()

    return {
        'org_name': org_name,
        'sub_unit': sub_unit,
        'country': country,
        'city': city,
        'address': address,
    }


def _institution_type(sub_unit):
    """Map the sub-unit text to an institution type, defaulting to OFFICIAL_INSTITUTION."""
    if sub_unit and ('Library' in sub_unit or 'Bibliothèque' in sub_unit):
        return 'LIBRARY'
    if sub_unit and 'Archives' in sub_unit:
        return 'ARCHIVE'
    return 'OFFICIAL_INSTITUTION'


def _build_institution(isil_code, fields, extraction_date):
    """Assemble the record dict for one entry (key order is kept for YAML output)."""
    org_name = fields['org_name']
    sub_unit = fields['sub_unit']
    # The id slug is the lower-cased ISIL code with hyphens removed.
    slug = isil_code.lower().replace('-', '')

    institution = {
        'id': f"https://w3id.org/heritage/custodian/eur/{slug}",
        'name': org_name,
        'institution_type': _institution_type(sub_unit),
        'description': f'European Union institution: {org_name}',
        'locations': [{
            'country': fields['country'],
            'city': fields['city'],
            'street_address': fields['address'],
        }],
        'identifiers': [
            {
                'identifier_scheme': 'ISIL',
                'identifier_value': isil_code,
                'identifier_url': f'https://isil.org/{isil_code}',
            }
        ],
        'provenance': {
            # NOTE(review): the input is read as plain text; confirm that
            # 'CSV_REGISTRY' is the intended data_source value.
            'data_source': 'CSV_REGISTRY',
            'data_tier': 'TIER_1_AUTHORITATIVE',
            'extraction_date': extraction_date,
            'extraction_method': 'Text parsing from EUR ISIL directory',
            'confidence_score': 0.95,
        },
    }
    if sub_unit and sub_unit != org_name:
        institution['alternative_names'] = [sub_unit]
    return institution


def parse_eur_isil_directory(filepath: str) -> list:
    """Parse the EUR ISIL text file into a list of institution records.

    The file is split into entries, each starting at an ``EUR-<CODE>`` token
    and running up to the next such token (or the end of the text). For each
    entry the organization name, sub-unit, country, city and street address
    are picked out with simple keyword checks.

    Args:
        filepath: Path of the UTF-8 encoded directory text file.

    Returns:
        A list of dicts, one per entry for which an organization name was
        found. Entries without a recognisable name are skipped. All records
        of one call share the same ``provenance.extraction_date``.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    with open(filepath, 'r', encoding='utf-8') as f:
        content = f.read()

    # Taken once per run so that every record carries the same timestamp
    # instead of values that drift by microseconds between records.
    extraction_date = datetime.now(timezone.utc).isoformat()

    institutions = []
    for entry in _ENTRY_RE.findall(content):
        lines = [text for text in (raw.strip() for raw in entry.split('\n')) if text]
        if not lines:
            continue

        # The ISIL code is expected at the start of the entry's first line.
        isil_match = _ISIL_CODE_RE.match(lines[0])
        if not isil_match:
            continue

        fields = _extract_fields(lines)
        if not fields['org_name']:
            continue

        institutions.append(
            _build_institution(isil_match.group(0), fields, extraction_date)
        )

    return institutions
def main():
    """Parse the EUR ISIL directory and write the records to a YAML file.

    Reads ``data/isil/EUR/isil-directory.txt``, writes the parsed records to
    ``data/instances/eu_institutions.yaml`` (preceded by a short comment
    header) and prints a one-line summary per institution.
    """
    # Local import: only this entry point needs it, to prepare the output directory.
    import os

    print("Parsing European Union ISIL directory...")

    filepath = "data/isil/EUR/isil-directory.txt"
    institutions = parse_eur_isil_directory(filepath)
    total = len(institutions)

    print(f"✓ Parsed {total} EU institutions")

    # Save to YAML
    output_file = "data/instances/eu_institutions.yaml"
    # Create the target directory first so a fresh checkout does not fail
    # with FileNotFoundError when the file is opened for writing.
    os.makedirs(os.path.dirname(output_file), exist_ok=True)
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write(
            "---\n"
            "# European Union ISIL Registry\n"
            f"# Generated: {datetime.now(timezone.utc).isoformat()}\n"
            f"# Total institutions: {total}\n\n"
        )
        # sort_keys=False keeps the key order in which the records were built.
        yaml.dump(institutions, f, allow_unicode=True, default_flow_style=False, sort_keys=False)

    print(f"✓ Saved to {output_file}")

    # Print summary
    print("\nInstitutions:")
    for inst in institutions:
        print(f" {inst['identifiers'][0]['identifier_value']}: {inst['name']}")
# Run the conversion only when this file is executed as a script, so that
# importing the module has no side effects.
if __name__ == "__main__":
    main()