#!/usr/bin/env python3
"""Parse European Union ISIL directory to LinkML format."""

import re
from datetime import datetime, timezone
from pathlib import Path

# Paths used when the script is run without arguments.
DEFAULT_INPUT = "data/isil/EUR/isil-directory.txt"
DEFAULT_OUTPUT = "data/instances/eu_institutions.yaml"

# One directory entry: an ISIL code, whitespace, then everything up to the
# next "EUR-..." token or the end of the text. DOTALL lets an entry span
# several lines.
_ENTRY_RE = re.compile(r'EUR-[A-Z0-9]+\s+.*?(?=EUR-[A-Z0-9]+|$)', re.DOTALL)
_ISIL_RE = re.compile(r'EUR-[A-Z0-9]+')
# Date tokens such as "01-Jan-2015" or "01-Jan-15".
_DATE_RE = re.compile(r'\d{2}-[A-Za-z]{3}-\d{2,4}')
_WHITESPACE_RE = re.compile(r'\s+')

# Substrings (case-sensitive) that mark a line as carrying a given field.
_ORG_KEYWORDS = ('European', 'Court', 'Committee')
_SUB_UNIT_KEYWORDS = ('Library', 'Archives', 'Historical')
_ADDRESS_KEYWORDS = ('Rue', 'rue', 'Building', 'Via')


def _contains_any(text, keywords):
    """Return True if any of *keywords* occurs as a substring of *text*."""
    return any(keyword in text for keyword in keywords)


def _clean_name(text):
    """Drop ISIL codes and date tokens from *text* and collapse whitespace."""
    text = _ISIL_RE.sub('', text).strip()
    text = _DATE_RE.sub('', text).strip()
    return _WHITESPACE_RE.sub(' ', text).strip()


def _scan_entry(lines):
    """Collect the fields of one entry from its stripped, non-empty lines.

    Returns a dict with keys ``org_name``, ``sub_unit``, ``country``,
    ``city`` and ``address``; a field that no line supplied is ``None``.
    When several lines match the same field, the last match wins.
    """
    fields = {
        'org_name': None,
        'sub_unit': None,
        'country': None,
        'city': None,
        'address': None,
    }

    for line in lines:
        # Organization name
        if _contains_any(line, _ORG_KEYWORDS):
            candidate = _clean_name(line)
            if not candidate or candidate == '/':
                # Defensive: never let a degenerate remainder replace a
                # name found earlier; the rest of the line is skipped.
                continue
            fields['org_name'] = candidate

        # Sub-unit: accepted while no organization name is known yet, or
        # when the line also carries the organization name.
        if _contains_any(line, _SUB_UNIT_KEYWORDS):
            if not fields['org_name'] or fields['org_name'] in line:
                # Cleaned like the organization name so that the ISIL code
                # and date tokens do not leak into alternative names.
                fields['sub_unit'] = _clean_name(line)

        # Location
        if 'Belgium' in line:
            fields['country'] = 'BE'
            if 'Brussels' in line:
                fields['city'] = 'Brussels'
        elif 'Luxembourg' in line:
            fields['country'] = 'LU'
            fields['city'] = 'Luxembourg'
        elif 'Italy' in line:
            fields['country'] = 'IT'
            if 'Florence' in line:
                fields['city'] = 'Florence'

        # Address
        if _contains_any(line, _ADDRESS_KEYWORDS):
            fields['address'] = line.strip()

    return fields


def _institution_type(sub_unit):
    """Map the sub-unit text to an institution type label."""
    if sub_unit and ('Library' in sub_unit or 'Bibliothèque' in sub_unit):
        return 'LIBRARY'
    if sub_unit and 'Archives' in sub_unit:
        return 'ARCHIVE'
    return 'OFFICIAL_INSTITUTION'


def _build_record(isil_code, fields, extraction_date):
    """Assemble the LinkML-style record dict for one institution."""
    org_name = fields['org_name']
    sub_unit = fields['sub_unit']
    slug = isil_code.lower().replace('-', '')

    institution = {
        'id': f"https://w3id.org/heritage/custodian/eur/{slug}",
        'name': org_name,
        'institution_type': _institution_type(sub_unit),
        'description': f'European Union institution: {org_name}',
        'locations': [{
            'country': fields['country'],
            'city': fields['city'],
            'street_address': fields['address'],
        }],
        'identifiers': [
            {
                'identifier_scheme': 'ISIL',
                'identifier_value': isil_code,
                'identifier_url': f'https://isil.org/{isil_code}',
            }
        ],
        'provenance': {
            'data_source': 'CSV_REGISTRY',
            'data_tier': 'TIER_1_AUTHORITATIVE',
            'extraction_date': extraction_date,
            'extraction_method': 'Text parsing from EUR ISIL directory',
            'confidence_score': 0.95,
        },
    }

    if sub_unit and sub_unit != org_name:
        institution['alternative_names'] = [sub_unit]

    return institution


def parse_eur_isil_directory(filepath: str) -> list:
    """Parse EUR ISIL text file.

    The file is read as UTF-8 and split into entries, each starting at an
    ``EUR-<CODE>`` token. Entries in which no line contains one of the
    organization keywords ('European', 'Court', 'Committee') are skipped.

    Args:
        filepath: Path of the directory text file.

    Returns:
        A list of record dicts, in file order, with the keys ``id``,
        ``name``, ``institution_type``, ``description``, ``locations``,
        ``identifiers`` and ``provenance``, plus ``alternative_names`` when
        a sub-unit name differing from the organization name was found.

    Raises:
        OSError: If the file cannot be opened or read.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    with open(filepath, 'r', encoding='utf-8') as f:
        content = f.read()

    # A single timestamp per run, so every record of one extraction agrees.
    extraction_date = datetime.now(timezone.utc).isoformat()

    institutions = []
    for entry in _ENTRY_RE.findall(content):
        lines = [part.strip() for part in entry.split('\n') if part.strip()]
        if not lines:
            continue

        # First line has ISIL code
        isil_match = _ISIL_RE.match(lines[0])
        if not isil_match:
            continue
        isil_code = isil_match.group(0)

        fields = _scan_entry(lines)
        if not fields['org_name']:
            continue

        institutions.append(_build_record(isil_code, fields, extraction_date))

    return institutions


def main(filepath: str = DEFAULT_INPUT, output_file: str = DEFAULT_OUTPUT) -> None:
    """Parse the directory at *filepath* and write the records as YAML.

    Args:
        filepath: Input text file; defaults to the repository location.
        output_file: Destination YAML file; its parent directory is created
            when it does not exist yet.
    """
    # Imported here so the parser stays importable without PyYAML installed.
    import yaml

    print("Parsing European Union ISIL directory...")

    institutions = parse_eur_isil_directory(filepath)

    print(f"✓ Parsed {len(institutions)} EU institutions")

    # Save to YAML
    Path(output_file).parent.mkdir(parents=True, exist_ok=True)
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write("---\n")
        f.write("# European Union ISIL Registry\n")
        f.write(f"# Generated: {datetime.now(timezone.utc).isoformat()}\n")
        f.write(f"# Total institutions: {len(institutions)}\n\n")
        yaml.dump(institutions, f, allow_unicode=True,
                  default_flow_style=False, sort_keys=False)

    print(f"✓ Saved to {output_file}")

    # Print summary
    print("\nInstitutions:")
    for inst in institutions:
        print(f" {inst['identifiers'][0]['identifier_value']}: {inst['name']}")


if __name__ == '__main__':
    main()