#!/usr/bin/env python3
"""
Optimize AGENTS.md by condensing verbose rules into summaries with .opencode references.

This script:
1. Preserves header (lines 1-23)
2. Condenses rules 0-34 (lines 24-2775) into enhanced summaries with .opencode references
3. Preserves all unique reference content (lines 2776-5332) in full
"""

import re
from pathlib import Path

# Enhanced rule mappings with more detail for key rules
RULE_CONTENT = {
|
|
0: """### Rule 0: LinkML Schemas Are the Single Source of Truth
|
|
|
|
🚨 **CRITICAL**: LinkML schema files in `schemas/20251121/linkml/` are the authoritative definition of the Heritage Custodian Ontology.
|
|
|
|
**Key Points**:
|
|
- ALL derived files (RDF, TypeDB, UML) are GENERATED - never edit them directly
|
|
- Always use full timestamps (`YYYYMMDD_HHMMSS`) in generated filenames
|
|
- Primary schema: `schemas/20251121/linkml/01_custodian_name.yaml`
|
|
|
|
**Workflow**:
|
|
```
|
|
1. EDIT LinkML schema
|
|
2. REGENERATE: gen-owl → rdfpipe → all 8 RDF formats
|
|
3. REGENERATE: gen-yuml → UML diagrams
|
|
4. UPDATE: TypeDB schema (manual)
|
|
5. VALIDATE: linkml-validate
|
|
```
|
|
|
|
**See**: `.opencode/SCHEMA_GENERATION_RULES.md` for complete generation rules
|
|
|
|
---
|
|
""",
|
|
|
|
1: """### Rule 1: Ontology Files Are Your Primary Reference
|
|
|
|
🚨 **CRITICAL**: Before designing any schema, class, or property, consult base ontologies.
|
|
|
|
**Required Steps**:
|
|
1. READ base ontology files in `/data/ontology/`
|
|
2. SEARCH for existing classes and properties
|
|
3. DOCUMENT your ontology alignment with rationale
|
|
4. NEVER invent custom properties when ontology equivalents exist
|
|
|
|
**Available Ontologies**:
|
|
- `tooiont.ttl` - TOOI (Dutch government)
|
|
- `core-public-organisation-ap.ttl` - CPOV (EU public sector)
|
|
- `schemaorg.owl` - Schema.org (web semantics)
|
|
- `CIDOC_CRM_v7.1.3.rdf` - CIDOC-CRM (cultural heritage)
|
|
- `RiC-O_1-1.rdf` - Records in Contexts (archival)
|
|
- `pico.ttl` - PiCo (person observations)
|
|
|
|
**See**: `.opencode/HYPER_MODULAR_STRUCTURE.md` for complete documentation
|
|
|
|
---
|
|
""",
|
|
|
|
2: """### Rule 2: Wikidata Entities Are NOT Ontology Classes
|
|
|
|
🚨 **CRITICAL**: Files in `data/wikidata/GLAMORCUBEPSXHFN/` contain Wikidata Q-numbers for institution TYPES, NOT formal ontology class definitions.
|
|
|
|
**Workflow**: `Wikidata Q-number → Analyze semantics → Search ontologies → Map to ontology class → Document rationale`
|
|
|
|
**Note**: Full rule content preserved in Appendix below (no .opencode equivalent).
|
|
|
|
---
|
|
""",
|
|
|
|
3: """### Rule 3: Multi-Aspect Modeling is Mandatory
|
|
|
|
🚨 **CRITICAL**: Every heritage entity has MULTIPLE ontological aspects with INDEPENDENT temporal lifecycles.
|
|
|
|
**Required Aspects**:
|
|
| Aspect | Ontology Class | Temporal Example |
|
|
|--------|---------------|------------------|
|
|
| Place | `crm:E27_Site` | Building: 1880-present |
|
|
| Custodian | `cpov:PublicOrganisation` | Foundation: 1994-present |
|
|
| Legal Form | `org:FormalOrganization` | Registration: 1994-present |
|
|
| Collections | `rico:RecordSet` | Accession dates vary |
|
|
| People | `pico:PersonObservation` | Employment: 2020-present |
|
|
| Events | `crm:E10_Transfer_of_Custody` | Discrete timestamps |
|
|
|
|
**Note**: Full rule content preserved in Appendix below (no .opencode equivalent).
|
|
|
|
---
|
|
""",
|
|
|
|
4: """### Rule 4: Technical Classes Are Excluded from Visualizations
|
|
|
|
🚨 **CRITICAL**: Some LinkML classes exist solely for validation (e.g., `Container` with `tree_root: true`). These have NO semantic significance and MUST be excluded from UML diagrams.
|
|
|
|
**Excluded Classes**: `Container` (tree_root for validation only)
|
|
|
|
**See**: `.opencode/LINKML_TECHNICAL_CLASSES.md` for complete documentation
|
|
|
|
---
|
|
""",
|
|
|
|
5: """### Rule 5: NEVER Delete Enriched Data - Additive Only
|
|
|
|
🚨 **CRITICAL**: Data enrichment is ADDITIVE ONLY. Never delete or overwrite existing enriched content.
|
|
|
|
**Protected Data Types**:
|
|
| Source | Protected Fields |
|
|
|--------|------------------|
|
|
| Google Maps | `reviews`, `rating`, `photo_count`, `popular_times`, `place_id` |
|
|
| OpenStreetMap | `osm_id`, `osm_type`, `osm_tags`, `amenity`, `heritage` |
|
|
| Wikidata | `wikidata_id`, `claims`, `sitelinks`, `aliases` |
|
|
| Website Scrape | `organization_details`, `collections`, `contact`, `social_media` |
|
|
| ISIL Registry | `isil_code`, `assigned_date`, `remarks` |
|
|
|
|
**See**: `.opencode/DATA_PRESERVATION_RULES.md` for complete documentation
|
|
|
|
---
|
|
""",
|
|
|
|
6: """### Rule 6: WebObservation Claims MUST Have XPath Provenance
|
|
|
|
🚨 **CRITICAL**: Every claim extracted from a webpage MUST have an XPath pointer to the exact location in archived HTML. Claims without XPath provenance are FABRICATED.
|
|
|
|
**Required Fields**:
|
|
```yaml
|
|
claim_type: full_name
|
|
claim_value: "Institution Name"
|
|
source_url: https://example.org/about
|
|
retrieved_on: "2025-11-29T12:28:00Z"
|
|
xpath: /html/body/div[1]/h1
|
|
html_file: web/GHCID/example.org/rendered.html
|
|
xpath_match_score: 1.0
|
|
```
|
|
|
|
**Scope**: Applies to `WebClaim` and `WebObservation` classes. Other classes (CustodianTimelineEvent, GoogleMapsEnrichment) have different provenance models.
|
|
|
|
**See**: `.opencode/WEB_OBSERVATION_PROVENANCE_RULES.md` for complete documentation
|
|
|
|
---
|
|
""",
|
|
|
|
7: """### Rule 7: Deployment is LOCAL via SSH/rsync (NO CI/CD)
|
|
|
|
🚨 **CRITICAL**: NO GitHub Actions. ALL deployments executed locally via SSH and rsync.
|
|
|
|
**Server**: `91.98.224.44` (Hetzner Cloud)
|
|
|
|
**Two Frontend Apps** (MONOREPO):
|
|
| Domain | Local Directory | Server Path |
|
|
|--------|-----------------|-------------|
|
|
| bronhouder.nl | `/frontend/` | `/var/www/glam-frontend/` |
|
|
| archief.support | `/apps/archief-assistent/` | `/var/www/archief-assistent/` |
|
|
|
|
**Deployment Commands**:
|
|
```bash
|
|
./infrastructure/deploy.sh --frontend # bronhouder.nl
|
|
./infrastructure/deploy.sh --data # Data files only
|
|
./infrastructure/deploy.sh --status # Check server
|
|
```
|
|
|
|
**See**: `.opencode/DEPLOYMENT_RULES.md` and `.opencode/MONOREPO_FRONTEND_APPS.md`
|
|
|
|
---
|
|
""",
|
|
|
|
8: """### Rule 8: Legal Form Terms MUST Be Filtered from CustodianName
|
|
|
|
🚨 **CRITICAL**: Exception to emic principle - Legal forms are ALWAYS filtered from CustodianName.
|
|
|
|
**Examples**: `Stichting Rijksmuseum` → CustodianName: `Rijksmuseum`, Legal Form: `Stichting`
|
|
|
|
**Terms to Filter** (by language):
|
|
- Dutch: Stichting, B.V., N.V., Coöperatie
|
|
- English: Foundation, Inc., Ltd., LLC, Corp.
|
|
- German: Stiftung, e.V., GmbH, AG
|
|
- French: Fondation, S.A., S.A.R.L.
|
|
|
|
**NOT Filtered** (part of identity): Vereniging, Association, Society, Verein
|
|
|
|
**See**: `.opencode/LEGAL_FORM_FILTERING_RULE.md` for complete documentation
|
|
|
|
---
|
|
""",
|
|
|
|
9: """### Rule 9: Enum-to-Class Promotion - Single Source of Truth
|
|
|
|
🚨 **CRITICAL**: When an enum is promoted to a class hierarchy, the original enum MUST be deleted. Never maintain parallel enum/class definitions.
|
|
|
|
**Archive Location**: `schemas/20251121/linkml/archive/enums/`
|
|
|
|
**See**: `.opencode/ENUM_TO_CLASS_PRINCIPLE.md` for complete documentation
|
|
|
|
---
|
|
""",
|
|
|
|
10: """### Rule 10: CH-Annotator is the Entity Annotation Convention
|
|
|
|
🚨 **CRITICAL**: All entity annotation follows `ch_annotator-v1_7_0` convention.
|
|
|
|
**9 Hypernym Types**: AGT (Agent), GRP (Group), TOP (Toponym), GEO (Geometry), TMP (Temporal), APP (Appellation), ROL (Role), WRK (Work), QTY (Quantity)
|
|
|
|
**Heritage Institutions**: `GRP.HER` with GLAMORCUBESFIXPHDNT subtypes (GRP.HER.MUS, GRP.HER.LIB, GRP.HER.ARC, etc.)
|
|
|
|
**See**: `.opencode/CH_ANNOTATOR_CONVENTION.md` for complete documentation
|
|
|
|
---
|
|
""",
|
|
|
|
11: """### Rule 11: Z.AI GLM API for LLM Tasks (NOT BigModel)
|
|
|
|
🚨 **CRITICAL**: Use Z.AI Coding Plan endpoint, NOT regular BigModel API.
|
|
|
|
**Configuration**:
|
|
- API URL: `https://api.z.ai/api/coding/paas/v4/chat/completions`
|
|
- Environment Variable: `ZAI_API_TOKEN`
|
|
- Models: `glm-4.5`, `glm-4.5-air`, `glm-4.5-flash`, `glm-4.6`
|
|
- Cost: Free (0 per token)
|
|
|
|
**See**: `.opencode/ZAI_GLM_API_RULES.md` for complete documentation
|
|
|
|
---
|
|
""",
|
|
|
|
12: """### Rule 12: Person Data Reference Pattern - Avoid Inline Duplication
|
|
|
|
🚨 **CRITICAL**: Person profiles stored in `data/custodian/person/entity/`. Custodian files reference via `person_profile_path` - NEVER duplicate 50+ lines of profile data inline.
|
|
|
|
**File Naming**: `{linkedin-slug}_{ISO-timestamp}.json`
|
|
|
|
**See**: `.opencode/PERSON_DATA_REFERENCE_PATTERN.md` for complete documentation
|
|
|
|
---
|
|
""",
|
|
|
|
13: """### Rule 13: Custodian Type Annotations on LinkML Schema Elements
|
|
|
|
🚨 **CRITICAL**: All schema elements MUST have `custodian_types` annotation with GLAMORCUBESFIXPHDNT single-letter codes.
|
|
|
|
**Annotation Keys**: `custodian_types` (list), `custodian_types_rationale` (string), `custodian_types_primary` (string)
|
|
|
|
**Universal**: Use `["*"]` for elements applying to all types.
|
|
|
|
**See**: `.opencode/CUSTODIAN_TYPE_ANNOTATION_CONVENTION.md` for complete documentation
|
|
|
|
---
|
|
""",
|
|
|
|
14: """### Rule 14: Exa MCP LinkedIn Profile Extraction
|
|
|
|
🚨 **CRITICAL**: Use `exa_crawling_exa` with direct URL for comprehensive LinkedIn profile extraction.
|
|
|
|
**Tool Priority**:
|
|
1. `exa_crawling_exa` - Profile URL known (preferred)
|
|
2. `exa_linkedin_search_exa` - Profile URL unknown
|
|
3. `exa_web_search_exa` - Fallback search
|
|
|
|
**Output**: `data/custodian/person/entity/{linkedin-slug}_{timestamp}.json`
|
|
|
|
**See**: `.opencode/EXA_LINKEDIN_EXTRACTION_RULES.md` for complete documentation
|
|
|
|
---
|
|
""",
|
|
|
|
15: """### Rule 15: Connection Data Registration - Full Network Preservation
|
|
|
|
🚨 **CRITICAL**: ALL LinkedIn connections must be fully registered in dedicated connections files.
|
|
|
|
**File Location**: `data/custodian/person/{slug}_connections_{timestamp}.json`
|
|
|
|
**Required**: `source_metadata`, `connections[]` array, `network_analysis` with heritage type breakdown
|
|
|
|
**See**: `.opencode/CONNECTION_DATA_REGISTRATION_RULE.md` for complete documentation
|
|
|
|
---
|
|
""",
|
|
|
|
16: """### Rule 16: LinkedIn Photo URLs - Store CDN URLs, Not Overlay Pages
|
|
|
|
🚨 **CRITICAL**: Store actual CDN URL, NOT overlay page URL.
|
|
|
|
- ❌ WRONG: `linkedin.com/in/{slug}/overlay/photo/` (derivable, useless)
|
|
- ✅ CORRECT: `media.licdn.com/dms/image/v2/{ID}/profile-displayphoto-shrink_800_800/...`
|
|
|
|
**See**: `.opencode/LINKEDIN_PHOTO_CDN_RULE.md` for complete documentation
|
|
|
|
---
|
|
""",
|
|
|
|
17: """### Rule 17: LinkedIn Connection Unique Identifiers
|
|
|
|
🚨 **CRITICAL**: Every connection gets unique ID including abbreviated and anonymous names.
|
|
|
|
**Format**: `{target_slug}_conn_{index:04d}_{name_slug}`
|
|
|
|
**Name Types**: `full`, `abbreviated` (Amy B.), `anonymous` (LinkedIn Member)
|
|
|
|
**See**: `.opencode/LINKEDIN_CONNECTION_ID_RULE.md` for complete documentation
|
|
|
|
---
|
|
""",
|
|
|
|
18: """### Rule 18: Custodian Staff Parsing from LinkedIn Company Pages
|
|
|
|
🚨 **CRITICAL**: Use `scripts/parse_custodian_staff.py` for staff registration parsing.
|
|
|
|
**Staff ID Format**: `{custodian_slug}_staff_{index:04d}_{name_slug}`
|
|
|
|
**See**: `.opencode/CUSTODIAN_STAFF_PARSING_RULE.md` for complete documentation
|
|
|
|
---
|
|
""",
|
|
|
|
19: """### Rule 19: HTML-Only LinkedIn Extraction (Preferred Method)
|
|
|
|
🚨 **CRITICAL**: Use ONLY manually saved HTML files for LinkedIn data extraction.
|
|
|
|
**Data Completeness**: HTML = 100% (including profile URLs), MD copy-paste = ~90%
|
|
|
|
**Script**: `scripts/parse_linkedin_html.py`
|
|
|
|
**How to Save**: Navigate → Scroll to load all → File > Save Page As > "Webpage, Complete"
|
|
|
|
**See**: `.opencode/HTML_ONLY_LINKEDIN_EXTRACTION_RULE.md` for complete documentation
|
|
|
|
---
|
|
""",
|
|
|
|
20: """### Rule 20: Person Entity Profiles - Individual File Storage
|
|
|
|
🚨 **CRITICAL**: Person profiles stored as individual files in `data/custodian/person/entity/`.
|
|
|
|
**File Naming**: `{linkedin-slug}_{ISO-timestamp}.json`
|
|
|
|
**Required**: ALL profiles MUST use structured JSON with `extraction_agent: "claude-opus-4.5"`. Raw content dumps are NOT acceptable.
|
|
|
|
**See**: `.opencode/PERSON_ENTITY_PROFILE_FORMAT_RULE.md` for complete documentation
|
|
|
|
---
|
|
""",
|
|
|
|
21: """### Rule 21: Data Fabrication is Strictly Prohibited
|
|
|
|
🚨 **CRITICAL**: ALL DATA MUST BE REAL AND VERIFIABLE. Fabricating any data is strictly prohibited.
|
|
|
|
**❌ FORBIDDEN**:
|
|
- Creating fake names, job titles, companies
|
|
- Inventing education history or skills
|
|
- Generating placeholder data when extraction fails
|
|
- Creating fictional LinkedIn URLs
|
|
|
|
**✅ ALLOWED**:
|
|
- Skip profiles that cannot be extracted
|
|
- Return `null` or empty fields for missing data
|
|
- Mark profiles with `extraction_error: true`
|
|
- Log why extraction failed
|
|
|
|
**See**: `.opencode/DATA_FABRICATION_PROHIBITION.md` for complete documentation
|
|
|
|
---
|
|
""",
|
|
|
|
22: """### Rule 22: Custodian YAML Files Are the Single Source of Truth
|
|
|
|
🚨 **CRITICAL**: `data/custodian/*.yaml` is the SINGLE SOURCE OF TRUTH for all enrichment data.
|
|
|
|
**Data Hierarchy**:
|
|
```
|
|
data/custodian/*.yaml ← SINGLE SOURCE OF TRUTH
|
|
↓
|
|
Ducklake → PostgreSQL → TypeDB → Oxigraph → Qdrant
|
|
(All databases are DERIVED - never add data independently)
|
|
↓
|
|
REST API → Frontend (both DERIVED)
|
|
```
|
|
|
|
**Workflow**: FETCH → VALIDATE → WRITE TO YAML → Import to database → Verify
|
|
|
|
**See**: `.opencode/CUSTODIAN_DATA_SOURCE_OF_TRUTH.md` for complete documentation
|
|
|
|
---
|
|
""",
|
|
|
|
23: """### Rule 23: Social Media Link Validation - No Generic Links
|
|
|
|
🚨 **CRITICAL**: Social media links MUST be institution-specific, NOT generic platform homepages.
|
|
|
|
**Invalid**: `facebook.com/`, `facebook.com/facebook`, `twitter.com/twitter`
|
|
|
|
**Valid**: `facebook.com/rijksmuseum/`, `twitter.com/rijksmuseum`
|
|
|
|
**See**: `.opencode/SOCIAL_MEDIA_LINK_VALIDATION.md` for complete documentation
|
|
|
|
---
|
|
""",
|
|
|
|
24: """### Rule 24: Unused Import Investigation - Check Before Removing
|
|
|
|
🚨 **CRITICAL**: Before removing unused imports, INVESTIGATE whether they indicate incomplete implementations.
|
|
|
|
**Checklist**:
|
|
1. Was it recently used? (`git log -p --all -S 'ImportName'`)
|
|
2. Is there a TODO/FIXME?
|
|
3. Pattern mismatch (old vs new syntax)?
|
|
4. Incomplete feature?
|
|
5. Conditional usage (`TYPE_CHECKING` blocks)?
|
|
|
|
**See**: `.opencode/UNUSED_IMPORT_INVESTIGATION_RULE.md` for complete documentation
|
|
|
|
---
|
|
""",
|
|
|
|
25: """### Rule 25: Digital Platform Discovery Enrichment
|
|
|
|
🚨 **CRITICAL**: Every heritage custodian MUST be enriched with digital platform discovery data.
|
|
|
|
**Discover**: Collection management systems, discovery portals, external integrations, APIs
|
|
|
|
**Required Provenance**: `retrieval_agent`, `retrieval_timestamp`, `source_url`, `xpath_base`, `html_file`
|
|
|
|
**See**: `.opencode/DIGITAL_PLATFORM_DISCOVERY_RULE.md` for complete documentation
|
|
|
|
---
|
|
""",
|
|
|
|
26: """### Rule 26: Person Data Provenance - Web Claims for Staff Information
|
|
|
|
🚨 **CRITICAL**: All person/staff data MUST have web claim provenance with verifiable sources.
|
|
|
|
**Required Fields**: `claim_type`, `claim_value`, `source_url`, `retrieved_on`, `retrieval_agent`
|
|
|
|
**Recommended**: `xpath`, `xpath_match_score`
|
|
|
|
**See**: `.opencode/PERSON_DATA_PROVENANCE_RULE.md` for complete documentation
|
|
|
|
---
|
|
""",
|
|
|
|
27: """### Rule 27: Person-Custodian Data Architecture
|
|
|
|
🚨 **CRITICAL**: Person entity files are the SINGLE SOURCE OF TRUTH for all person data.
|
|
|
|
**In Person Entity File**: `extraction_metadata`, `profile_data`, `web_claims`, `affiliations`
|
|
|
|
**In Custodian YAML**: `person_id`, `person_name`, `role_title`, `affiliation_provenance`, `linkedin_profile_path` (reference only)
|
|
|
|
**NEVER**: Put `web_claims` in custodian YAML files
|
|
|
|
**See**: `.opencode/PERSON_CUSTODIAN_DATA_ARCHITECTURE.md` for complete documentation
|
|
|
|
---
|
|
""",
|
|
|
|
28: """### Rule 28: Web Claims Deduplication - No Redundant Claims
|
|
|
|
🚨 **CRITICAL**: Do not duplicate claims unless genuine variation exists with uncertainty.
|
|
|
|
**Eliminate**: Favicon variants, same value from different extractions, dynamic content
|
|
|
|
**Document**: Removed claims in `removed_claims` section for audit trail
|
|
|
|
**See**: `.opencode/WEB_CLAIMS_DEDUPLICATION_RULE.md` for complete documentation
|
|
|
|
---
|
|
""",
|
|
|
|
29: """### Rule 29: Anonymous Profile Name Derivation from LinkedIn Slugs
|
|
|
|
🚨 **CRITICAL**: Names CAN be derived from hyphenated LinkedIn slugs - this is data transformation, NOT fabrication.
|
|
|
|
**Dutch Particles**: Keep lowercase when not first word (van, de, den, der)
|
|
|
|
**Known Compound Slugs**: Use mapping for `jponjee` → "J. Ponjee", etc.
|
|
|
|
**See**: `.opencode/ANONYMOUS_PROFILE_NAME_RULE.md` for complete documentation
|
|
|
|
---
|
|
""",
|
|
|
|
30: """### Rule 30: Person Profile Extraction Confidence Scoring
|
|
|
|
🚨 **CRITICAL**: Every enriched profile MUST have confidence score (0.50-0.95) for data extraction quality.
|
|
|
|
**Distinct from**: Heritage sector relevance score (different purpose)
|
|
|
|
**Scoring Factors**:
|
|
- Clear job title: +0.10 to +0.15
|
|
- Named institution: +0.05 to +0.10
|
|
- Privacy-abbreviated name: -0.15 to -0.20
|
|
- Intern/trainee: -0.10
|
|
|
|
**See**: `.opencode/PERSON_PROFILE_CONFIDENCE_SCORING.md` for complete documentation
|
|
|
|
---
|
|
""",
|
|
|
|
31: """### Rule 31: Organizational Subdivision Extraction
|
|
|
|
🚨 **CRITICAL**: ALWAYS capture organizational subdivisions as structured data.
|
|
|
|
**Types**: department, team, unit, division, section, lab_or_center, office
|
|
|
|
**Store in**: `affiliations[].subdivision` with `type`, `name`, `parent_subdivision`, `extraction_source`
|
|
|
|
**See**: `.opencode/ORGANIZATIONAL_SUBDIVISION_EXTRACTION.md` for complete documentation
|
|
|
|
---
|
|
""",
|
|
|
|
32: """### Rule 32: Government Ministries Are Heritage Custodians (Type O)
|
|
|
|
🚨 **CRITICAL**: Government ministries ARE heritage custodians due to statutory record-keeping obligations.
|
|
|
|
**Heritage Relevance Scores**:
|
|
| Role Category | Score Range |
|
|
|---------------|-------------|
|
|
| Records Management | 0.40-0.50 |
|
|
| IT/Systems (records) | 0.30-0.40 |
|
|
| Policy/Advisory | 0.25-0.35 |
|
|
| Administrative | 0.15-0.25 |
|
|
|
|
**See**: `.opencode/GOVERNMENT_MINISTRY_HERITAGE_RULE.md` for complete documentation
|
|
|
|
---
|
|
""",
|
|
|
|
33: """### Rule 33: GHCID Collision Duplicate Detection
|
|
|
|
🚨 **CRITICAL**: Duplicate detection is MANDATORY in GHCID collision resolution.
|
|
|
|
**Decision Matrix**:
|
|
- ALL details match → DUPLICATE (keep earliest, archive later)
|
|
- Same name, different city → NOT DUPLICATE (keep both, add suffix)
|
|
- Same name, same city, different Wikidata IDs → NOT DUPLICATE
|
|
- When in doubt → Keep both files (can merge later)
|
|
|
|
**See**: `.opencode/GHCID_COLLISION_DUPLICATE_DETECTION.md` for complete documentation
|
|
|
|
---
|
|
""",
|
|
|
|
34: """### Rule 34: Linkup is the Preferred Web Scraper
|
|
|
|
🚨 **CRITICAL**: Use Linkup as primary web scraper. Firecrawl credits are limited.
|
|
|
|
**Tool Priority**:
|
|
| Priority | Tool | When to Use |
|
|
|----------|------|-------------|
|
|
| 1st | `linkup_linkup-search` | General research, finding pages |
|
|
| 2nd | `linkup_linkup-fetch` | Fetching known URL |
|
|
| 3rd | `firecrawl_*` | Only when Linkup fails |
|
|
| 4th | `playwright_*` | Interactive pages, HTML archival |
|
|
|
|
**Two-Phase for XPath Provenance** (Rule 6 compliance):
|
|
1. Linkup for discovery
|
|
2. Playwright for archival with XPath extraction
|
|
|
|
**See**: `.opencode/LINKUP_PREFERRED_WEB_SCRAPER_RULE.md` for complete documentation
|
|
|
|
---
|
|
""",
|
|
}
|
|
|
|
|
|
def main():
|
|
agents_path = Path("/Users/kempersc/apps/glam/AGENTS.md")
|
|
output_path = Path("/Users/kempersc/apps/glam/AGENTS_OPTIMIZED.md")
|
|
|
|
# Read original file
|
|
with open(agents_path, "r", encoding="utf-8") as f:
|
|
original_lines = f.readlines()
|
|
|
|
print(f"Original file: {len(original_lines)} lines")
|
|
|
|
# Extract sections
|
|
header = original_lines[:22] # Lines 1-22 (0-indexed: 0-21)
|
|
unique_content = original_lines[2775:] # Lines 2776+ (0-indexed: 2775+)
|
|
|
|
print(f"Header: {len(header)} lines")
|
|
print(f"Unique content: {len(unique_content)} lines")
|
|
|
|
# Extract Rules 2-3 full content for appendix
|
|
rule2_start = None
|
|
rule4_start = None
|
|
|
|
for i, line in enumerate(original_lines):
|
|
if "### Rule 2: Wikidata Entities Are NOT Ontology Classes" in line:
|
|
rule2_start = i
|
|
elif "### Rule 4:" in line:
|
|
rule4_start = i
|
|
break
|
|
|
|
if rule2_start and rule4_start:
|
|
rules_2_3_content = original_lines[rule2_start:rule4_start]
|
|
print(f"Rules 2-3 full content: {len(rules_2_3_content)} lines")
|
|
else:
|
|
rules_2_3_content = []
|
|
print("Warning: Could not extract Rules 2-3 boundaries")
|
|
|
|
# Build output
|
|
output_lines = []
|
|
|
|
# Header
|
|
output_lines.extend(header)
|
|
output_lines.append("\n")
|
|
|
|
# Enhanced rules section
|
|
output_lines.append("## 🚨 CRITICAL RULES FOR ALL AGENTS\n\n")
|
|
output_lines.append("This section summarizes 35 critical rules. Each rule has complete documentation in `.opencode/` files.\n\n")
|
|
|
|
for rule_num in range(35):
|
|
output_lines.append(RULE_CONTENT[rule_num])
|
|
output_lines.append("\n")
|
|
|
|
# Add appendix for rules without .opencode files
|
|
if rules_2_3_content:
|
|
output_lines.append("## Appendix: Full Rule Content (No .opencode Equivalent)\n\n")
|
|
output_lines.append("The following rules have no separate .opencode file and are preserved in full:\n\n")
|
|
output_lines.extend([line if line.endswith("\n") else line + "\n" for line in rules_2_3_content])
|
|
output_lines.append("\n")
|
|
|
|
# Unique content (Project Overview onward)
|
|
output_lines.extend(unique_content)
|
|
|
|
# Write output
|
|
with open(output_path, "w", encoding="utf-8") as f:
|
|
f.writelines(output_lines)
|
|
|
|
final_lines = len(output_lines)
|
|
print(f"\nOutput file: {final_lines} lines")
|
|
print(f"Reduction: {len(original_lines)} → {final_lines} ({len(original_lines) - final_lines} lines removed)")
|
|
print(f"Saved to: {output_path}")
|
|
|
|
return output_path
|
|
|
|
|
|
if __name__ == "__main__":
|
|
main()
|