142 lines
4.8 KiB
Python
142 lines
4.8 KiB
Python
#!/usr/bin/env python3
|
|
"""Parse European Union ISIL directory to LinkML format."""
|
|
|
|
import re
|
|
import yaml
|
|
from datetime import datetime, timezone
|
|
|
|
# An EUR ISIL code such as "EUR-ABC0001".
_ISIL_CODE_RE = re.compile(r'EUR-[A-Z0-9]+')
# One directory entry: an ISIL code followed by everything up to the next
# ISIL code (or the end of the text).
_ENTRY_RE = re.compile(r'EUR-[A-Z0-9]+\s+.*?(?=EUR-[A-Z0-9]+|$)', re.DOTALL)
# Dates written like "01-Jan-2015" or "01-Jan-15".
_DATE_RE = re.compile(r'\d{2}-[A-Za-z]{3}-\d{2,4}')
_WHITESPACE_RE = re.compile(r'\s+')

# Substrings that mark a line as carrying a given kind of information.
_ORG_KEYWORDS = ('European', 'Court', 'Committee')
_SUB_UNIT_KEYWORDS = ('Library', 'Archives', 'Historical')
_ADDRESS_MARKERS = ('Rue', 'rue', 'Building', 'Via')


def _strip_registry_fields(text):
    """Return *text* without ISIL codes and dates, with whitespace collapsed."""
    text = _ISIL_CODE_RE.sub('', text).strip()
    text = _DATE_RE.sub('', text).strip()
    return _WHITESPACE_RE.sub(' ', text).strip()


def _institution_type(sub_unit):
    """Map the detected sub-unit text (or None) to an institution type label."""
    if sub_unit and ('Library' in sub_unit or 'Bibliothèque' in sub_unit):
        return 'LIBRARY'
    if sub_unit and 'Archives' in sub_unit:
        return 'ARCHIVE'
    return 'OFFICIAL_INSTITUTION'


def parse_eur_isil_directory(filepath: str) -> list:
    """Parse the EUR ISIL directory text file into institution records.

    The text is split into entries, each starting at an ``EUR-...`` code.
    Every non-blank line of an entry is scanned for keyword substrings to
    pick up the organisation name, a sub-unit, the location and an address.

    Args:
        filepath: Path of the UTF-8 encoded directory text file.

    Returns:
        A list of dicts, one per entry for which an organisation name was
        found, with the keys ``id``, ``name``, ``institution_type``,
        ``description``, ``locations``, ``identifiers``, ``provenance`` and,
        when a sub-unit differing from the name was found,
        ``alternative_names``. Entries without a recognisable organisation
        name are skipped.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    with open(filepath, 'r', encoding='utf-8') as f:
        content = f.read()

    # One timestamp per run, so every record of the same extraction agrees.
    extraction_date = datetime.now(timezone.utc).isoformat()

    institutions = []

    for entry in _ENTRY_RE.findall(content):
        lines = [raw.strip() for raw in entry.split('\n') if raw.strip()]
        if not lines:
            continue

        # The first line of an entry starts with the ISIL code.
        isil_match = _ISIL_CODE_RE.match(lines[0])
        if not isil_match:
            continue
        isil_code = isil_match.group(0)

        org_name = None
        city = None
        country = None
        address = None
        sub_unit = None

        for line in lines:
            # Organisation name: the last line containing a keyword wins.
            if any(keyword in line for keyword in _ORG_KEYWORDS):
                candidate = _strip_registry_fields(line)
                if not candidate or candidate == '/':
                    # Unusable name: keep any earlier one, skip this line.
                    continue
                org_name = candidate

            # Sub-unit: accepted before a name is known, or when the line
            # repeats the name. Cleaned like the name so that the ISIL code
            # and date never leak into ``alternative_names``.
            if any(keyword in line for keyword in _SUB_UNIT_KEYWORDS):
                if not org_name or org_name in line:
                    sub_unit = _strip_registry_fields(line)

            # Location
            if 'Belgium' in line:
                country = 'BE'
                if 'Brussels' in line:
                    city = 'Brussels'
            elif 'Luxembourg' in line:
                country = 'LU'
                city = 'Luxembourg'
            elif 'Italy' in line:
                country = 'IT'
                if 'Florence' in line:
                    city = 'Florence'

            # Address
            if any(marker in line for marker in _ADDRESS_MARKERS):
                address = line.strip()

        if not org_name:
            continue

        institution = {
            'id': (
                'https://w3id.org/heritage/custodian/eur/'
                + isil_code.lower().replace('-', '')
            ),
            'name': org_name,
            'institution_type': _institution_type(sub_unit),
            'description': f'European Union institution: {org_name}',
            'locations': [{
                'country': country,
                'city': city,
                'street_address': address,
            }],
            'identifiers': [
                {
                    'identifier_scheme': 'ISIL',
                    'identifier_value': isil_code,
                    'identifier_url': f'https://isil.org/{isil_code}',
                },
            ],
            'provenance': {
                'data_source': 'CSV_REGISTRY',
                'data_tier': 'TIER_1_AUTHORITATIVE',
                'extraction_date': extraction_date,
                'extraction_method': 'Text parsing from EUR ISIL directory',
                'confidence_score': 0.95,
            },
        }

        if sub_unit and sub_unit != org_name:
            institution['alternative_names'] = [sub_unit]

        institutions.append(institution)

    return institutions
|
|
|
|
def main():
    """Parse the EUR ISIL directory, write it out as YAML and print a summary.

    Reads ``data/isil/EUR/isil-directory.txt``, writes the parsed records
    under a short comment header to ``data/instances/eu_institutions.yaml``,
    then lists each record's ISIL code and name on stdout.
    """
    print("Parsing European Union ISIL directory...")

    records = parse_eur_isil_directory("data/isil/EUR/isil-directory.txt")
    total = len(records)

    print(f"✓ Parsed {total} EU institutions")

    # Write the YAML document: comment header first, then the records.
    output_file = "data/instances/eu_institutions.yaml"
    with open(output_file, 'w', encoding='utf-8') as out:
        header = (
            "---\n"
            "# European Union ISIL Registry\n"
            f"# Generated: {datetime.now(timezone.utc).isoformat()}\n"
            f"# Total institutions: {total}\n\n"
        )
        out.write(header)
        yaml.dump(
            records,
            out,
            allow_unicode=True,
            default_flow_style=False,
            sort_keys=False,
        )

    print(f"✓ Saved to {output_file}")

    # One line per institution: ISIL code and name.
    print("\nInstitutions:")
    for record in records:
        code = record['identifiers'][0]['identifier_value']
        print(f" {code}: {record['name']}")
|
|
|
|
# Script entry point: run the conversion only when this file is executed
# directly, not when it is imported as a module.
if __name__ == '__main__':
    main()
|