#!/usr/bin/env bash
# continuous-improve.sh — Continuous enrichment and quality loop
#
# Each cycle: processes every entity (resolve reviews, enrich from outside
# sources), then runs a deep cross-document validation scan.
#
# Usage: .automate/continuous-improve.sh [options]
#   --priority reviews|stale|random   Queue ordering (default: reviews)
#   --cycle-delay N                   Seconds between entities (default: 5)
#   --model MODEL                     LLM model (default: claude-sonnet-4.6)
#   --start N                         Skip first N entities in queue (resume)
#   --skip-unchanged                  Skip the agent for entities that have no open review questions and were not modified since they were last processed

set -euo pipefail

# ─── Parse arguments ───
# Defaults; each one can be overridden by the matching command-line flag.
PRIORITY="reviews"
CYCLE_DELAY=5
MODEL="claude-sonnet-4.6"
START_AT=0
SKIP_UNCHANGED=false

# Print the one-line usage summary to stdout.
usage() {
    echo "Usage: $0 [--priority reviews|stale|random] [--cycle-delay N] [--model MODEL] [--start N] [--skip-unchanged]"
}

# Report a command-line problem on stderr, followed by the usage line, and exit 1.
# Arguments: $1 - description of the problem
usage_error() {
    echo "Error: $1" >&2
    usage >&2
    exit 1
}

# Parse the command-line flags into the globals above and validate them.
# Arguments: the script's positional parameters
# Exits 1 (via usage_error) on an unknown flag, a flag with no value, or an
# invalid value, so bad input is rejected here instead of surfacing later as
# an "unbound variable" error or a silently empty queue.
parse_args() {
    while [[ $# -gt 0 ]]; do
        case "$1" in
            --priority|--cycle-delay|--model|--start)
                # Under `set -u`, reading "$2" when it is absent would abort
                # with an unhelpful message, so check the count first.
                [[ $# -ge 2 ]] || usage_error "$1 requires a value"
                case "$1" in
                    --priority)    PRIORITY="$2" ;;
                    --cycle-delay) CYCLE_DELAY="$2" ;;
                    --model)       MODEL="$2" ;;
                    --start)       START_AT="$2" ;;
                esac
                shift 2
                ;;
            --skip-unchanged)
                SKIP_UNCHANGED=true
                shift
                ;;
            *)
                usage_error "unknown option: $1"
                ;;
        esac
    done

    case "$PRIORITY" in
        reviews|stale|random) ;;
        *) usage_error "--priority must be one of: reviews, stale, random (got '$PRIORITY')" ;;
    esac

    # CYCLE_DELAY is handed to sleep; accept whole or decimal seconds.
    [[ "$CYCLE_DELAY" =~ ^[0-9]+([.][0-9]+)?$ ]] \
        || usage_error "--cycle-delay must be a non-negative number (got '$CYCLE_DELAY')"

    [[ "$START_AT" =~ ^[0-9]+$ ]] \
        || usage_error "--start must be a non-negative integer (got '$START_AT')"
    # Force base 10 so a value such as "08" is not rejected later as bad octal.
    START_AT=$((10#$START_AT))

    [[ -n "$MODEL" ]] || usage_error "--model must not be empty"
}

parse_args "$@"

# Absolute path of the directory that contains this script.
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
# Parent of the script directory; the functions below cd here before running
# find and git.
DOCS_DIR="$SCRIPT_DIR/.."
# Append-only TSV written by mark_processed: "<entity id><TAB><epoch seconds>".
STATE_FILE="$SCRIPT_DIR/.improve-state.tsv"
# Per-entity summaries (id, title, summary, duration) appended by process_entity.
ACTION_LOG="$SCRIPT_DIR/improve-history.log"

# From this point on, both stdout and stderr of the script are piped through
# tee, which appends them to continuous-improve.log and still echoes them to
# the original stdout. Anything printed before this line is not logged.
exec > >(tee -a "$SCRIPT_DIR/continuous-improve.log") 2>&1

# Print a message to stdout prefixed with "[YYYY-MM-DD HH:MM:SS] ".
# Arguments: the message words; they are joined with the first character of
#            IFS (a space by default).
log() {
    printf '[%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$*"
}

#######################################
# Stage everything under DOCS_DIR, commit it, and push with up to 3 attempts.
# Globals:   DOCS_DIR (read); the working directory is changed to it
# Arguments: $1 - commit message
# Outputs:   progress lines via log; git's own output
# Returns:   1 if DOCS_DIR cannot be entered or staging fails;
#            0 otherwise — including "nothing to commit", a failed commit and
#            a push that never succeeded (those are logged, not propagated,
#            so a caller running under `set -e` is not aborted by them).
#######################################
do_commit() {
    local msg="$1"
    local attempt   # keep the loop counter out of the global scope

    # Never run `git add -A` from an unexpected directory.
    cd "$DOCS_DIR" || { log "❌ Cannot enter $DOCS_DIR"; return 1; }
    git add -A || { log "❌ git add failed"; return 1; }

    # Exit status 0 from `git diff --cached --quiet` means nothing is staged.
    if git diff --cached --quiet; then
        return 0
    fi

    if ! git commit -m "$msg"; then
        log "❌ Commit failed: $msg"
        return 0
    fi

    for attempt in 1 2 3; do
        if git push 2>/dev/null; then
            log "✅ Committed: $msg"
            return 0
        fi
        log "⚠️  Push attempt $attempt failed, rebasing..."
        if ! git pull --rebase; then
            # A conflicted rebase would leave the repository mid-rebase and
            # break every later commit; back out so the next entity can proceed.
            git rebase --abort 2>/dev/null || true
            log "❌ Rebase failed and was aborted"
        fi
    done

    log "❌ Push failed after 3 attempts"
    return 0
}

# Emit one unsorted TSV row per markdown entity found under the current
# directory (helper for build_queue). Columns:
#   id, path, open-review count, mtime, last-processed epoch,
#   garbage-footnote count, incomplete-name flag
# Globals: STATE_FILE (read)
_build_queue_rows() {
    # Read the state file once. It is append-only, so the last line seen for
    # an id wins (equivalent to `grep "^id<TAB>" | tail -1 | cut -f2`).
    local -A last_seen=()
    local state_line state_id state_rest
    if [[ -f "$STATE_FILE" ]]; then
        while IFS= read -r state_line || [[ -n "$state_line" ]]; do
            [[ "$state_line" == *$'\t'* ]] || continue
            state_id=${state_line%%$'\t'*}
            [[ -n "$state_id" ]] || continue
            state_rest=${state_line#*$'\t'}
            last_seen["$state_id"]=${state_rest%%$'\t'*}
        done < "$STATE_FILE"
    fi

    local file fb_id review_count mtime last_processed garbage_count
    local incomplete_name dir_path doc_title
    while IFS= read -r -d '' file; do
        # Documents without a factbase id are not entities; skip them.
        fb_id=$(grep -oP '(?<=factbase:)[a-f0-9]{6}' "$file" 2>/dev/null | head -1) || true
        [[ -n "$fb_id" ]] || continue

        # grep -c exits 1 when the count is 0, hence the `|| true`.
        review_count=$(grep -c '^\- \[ \] `@q\[' "$file" 2>/dev/null) || true
        mtime=$(stat -c %Y "$file") || true
        garbage_count=$(grep -ciP '^\[\^.*\b(not a conflict|sequential|boundary overlap|not simultaneous|malformed tag|garbled|artifact|remove)\b' "$file" 2>/dev/null) || true
        last_processed=${last_seen[$fb_id]:-0}

        # Flag ruler docs with incomplete names (single word, alias, no space)
        incomplete_name=0
        dir_path=${file%/*}
        if [[ "${dir_path##*/}" == "rulers" ]]; then
            doc_title=$(grep '^# ' "$file" 2>/dev/null | head -1 | sed 's/^# //' | sed 's/ @t\[.*//;s/ \[\^.*//') || true
            if [[ -n "$doc_title" && "$doc_title" != *' '* ]]; then
                incomplete_name=1
            fi
        fi

        # printf '%s' keeps backslashes in paths literal (echo -e would turn a
        # "\t" or "\n" inside a file name into a real tab/newline). Empty
        # numeric fields default to 0 because the consumer splits on tabs with
        # `read`, which collapses empty columns and would shift the rest.
        printf '%s\t%s\t%s\t%s\t%s\t%s\t%s\n' \
            "$fb_id" "$file" "${review_count:-0}" "${mtime:-0}" \
            "${last_processed:-0}" "${garbage_count:-0}" "$incomplete_name"
    done < <(find . -name '*.md' \
        -not -path './.git/*' \
        -not -path './.automate/*' \
        -not -path './.kiro/*' \
        -not -path './.factbase/*' \
        -not -path './_orphans.md' \
        -print0)
    return 0
}

#######################################
# Print the work queue for one pass as TSV rows, ordered by PRIORITY.
# Globals:   DOCS_DIR, STATE_FILE, PRIORITY (all read); cwd is changed to
#            DOCS_DIR (callers use command substitution, so only the
#            subshell is affected)
# Outputs:   rows as described on _build_queue_rows, paths relative to DOCS_DIR
# Ordering:  reviews - incomplete names first, then most open reviews, most
#                      garbage footnotes, least recently processed
#            stale   - incomplete names first, then least recently processed,
#                      then most open reviews
#            random  - shuffled
#            An unrecognised PRIORITY warns on stderr and uses "reviews"
#            instead of silently producing an empty queue.
# Returns:   1 if DOCS_DIR cannot be entered, else the status of the
#            sort/shuf pipeline
#######################################
build_queue() {
    cd "$DOCS_DIR" || return 1

    # Rows are piped straight into the sorter, so no temporary file is left
    # behind when the script is interrupted.
    case "$PRIORITY" in
        reviews)
            _build_queue_rows | sort -t$'\t' -k7,7rn -k3,3rn -k6,6rn -k5,5n
            ;;
        stale)
            _build_queue_rows | sort -t$'\t' -k7,7rn -k5,5n -k3,3rn
            ;;
        random)
            _build_queue_rows | shuf
            ;;
        *)
            echo "build_queue: unknown priority '$PRIORITY', using 'reviews'" >&2
            _build_queue_rows | sort -t$'\t' -k7,7rn -k3,3rn -k6,6rn -k5,5n
            ;;
    esac
}

# Record that an entity was just handled by appending
# "<id><TAB><current epoch seconds>" to the state file.
# Globals:   STATE_FILE (appended to)
# Arguments: $1 - entity id
mark_processed() {
    local entity_id="$1"
    echo -e "${entity_id}\t$(date +%s)" >> "$STATE_FILE"
}

# ─── Bash-based mechanical cleanup (no agent needed) ───
#######################################
# Apply the mechanical, agent-free repairs to one document in place.
# Arguments: $1 - path of the markdown file
# Returns:   0 if at least one repair was applied, 1 if none was
#######################################
bash_cleanup() {
    local file="$1"
    local changed=false

    # Fix corrupted title: strip @t[...] and [^N] suffixes.
    # sed's ERE has no lazy quantifier, so `.+` is greedy and one substitution
    # only removes the LAST suffix ("# Name @t[..] [^1]" -> "# Name @t[..]").
    # Repeat (t = branch if a substitution happened) until nothing is left to
    # strip, so the title is fully clean after a single call.
    if grep -qP '^# .+(\s+@t\[|\s+\[\^)' "$file" 2>/dev/null; then
        sed -i -E -e ':again' -e 's/^(# .+)\s+(@t\[.*|\[\^.*)$/\1/' -e 't again' "$file"
        changed=true
    fi

    # Delete garbage footnotes (review answers dumped as source citations).
    # Collect the line numbers first, then delete them all in one sed run.
    local -a garbage_lines=()
    mapfile -t garbage_lines < <(grep -niP '^\[\^\d+\]:.*\b(not a conflict|sequential role|boundary overlap|not simultaneous|malformed tag|garbled|artifact from previous|remove garbled)\b' "$file" 2>/dev/null | cut -d: -f1)
    if (( ${#garbage_lines[@]} > 0 )); then
        local delete_script
        printf -v delete_script '%sd\n' "${garbage_lines[@]}"
        sed -i -e "$delete_script" "$file"
        changed=true
    fi

    # Remove review answer artifact lines in body
    if grep -qP '^- Artifact from previous review application' "$file" 2>/dev/null; then
        sed -i '/^- Artifact from previous review application/d' "$file"
        changed=true
    fi

    # Remove empty Review Queue sections and factbase:review markers
    if grep -qP '^## Review Queue|<!-- factbase:review -->' "$file" 2>/dev/null; then
        # Only remove if the review queue has no actual unanswered questions
        local has_open
        has_open=$(grep -c '^\- \[ \] `@q\[' "$file" 2>/dev/null) || true
        if [[ "$has_open" -eq 0 ]]; then
            sed -i '/^## Review Queue$/d; /^<!-- factbase:review -->$/d' "$file"
            # Drop the blank lines left dangling at the end of the file.
            sed -i -e :a -e '/^\n*$/{$d;N;ba' -e '}' "$file"
            changed=true
        fi
    fi

    [[ "$changed" == true ]]
}

#######################################
# Collect short excerpts from other documents that mention this entity's title.
# Globals:   DOCS_DIR (read); cwd is changed to it
# Arguments: $1 - path of the entity file, relative to DOCS_DIR ("./...")
# Outputs:   for up to 5 other documents, a "--- From: <title> (<path>) ---"
#            header followed by at most 3 matching lines; an empty line when
#            there are no mentions; nothing when the file has no title
# Returns:   1 if DOCS_DIR cannot be entered, otherwise 0
#######################################
get_related_context() {
    local file="$1"

    # Resolve the (DOCS_DIR-relative) path before reading the file.
    cd "$DOCS_DIR" || return 1

    local title
    title=$(grep '^# ' "$file" | head -1 | sed 's/^# //' | sed 's/ @t\[.*//;s/ \[\^.*//') || true
    [[ -n "$title" ]] || return 0

    local related=""
    local match_file match_title excerpt
    while IFS= read -r match_file; do
        [[ "$match_file" == "$file" ]] && continue
        match_title=$(grep '^# ' "$match_file" 2>/dev/null | head -1 | sed 's/^# //') || true
        # -F: the title is data, not a pattern ("Ramses [II]" must not be read
        # as a bracket expression); --: a title starting with "-" is not a flag.
        excerpt=$(grep -iF -- "$title" "$match_file" 2>/dev/null | head -3) || true
        if [[ -n "$excerpt" ]]; then
            related+="--- From: ${match_title} (${match_file}) ---
${excerpt}

"
        fi
    done < <(grep -rlF --include='*.md' -- "$title" . \
        | grep -vE '(^|/)(\.git|\.automate|\.kiro|\.factbase)/|_orphans' \
        | grep -vxF -- "$file" \
        | head -5)
    # The exclusion above matches whole directory names; the previous
    # unescaped ".git" also threw away files such as "digit.md". The entity's
    # own file is dropped before `head` so it cannot use up one of the 5 slots.

    echo "$related"
}

# ─── Agent-based processing (review questions + enrichment) ───
#######################################
# Build the per-entity prompt and run the agent (kiro-cli) on it.
# Globals:   DOCS_DIR, MODEL (read); cwd is changed to DOCS_DIR
#            VIKUNJA_TOKEN (read, optional) - API token for filing feature
#              requests. When it is unset or empty, the "IMPROVEMENT IDEAS"
#              step is left out of the prompt.
# Arguments: $1 - entity id
#            $2 - entity file path, relative to DOCS_DIR
#            $3 - entity title (used in log lines and the summary template)
# Outputs:   the agent's combined stdout/stderr, followed by one final
#            "<action_summary>...</action_summary>" line: the last summary the
#            agent printed, or "status: UNKNOWN | <title> | no summary
#            returned" when it printed none
# Returns:   1 if the file cannot be read or kiro-cli exits non-zero, else 0
#
# SECURITY NOTE(review): this function used to embed a literal bearer token
# in the prompt. That token has been committed and should be treated as
# compromised: rotate it and supply the new one through VIKUNJA_TOKEN.
#
# NOTE(review): the whole prompt is passed as a single command-line argument,
# so very large documents may hit the OS per-argument size limit — confirm
# whether kiro-cli can read the prompt from stdin.
#######################################
process_entity_agent() {
    local fb_id="$1"
    local file="$2"
    local title="$3"

    cd "$DOCS_DIR" || { log "❌ Cannot enter $DOCS_DIR"; return 1; }

    local content
    content=$(<"$file") || {
        log "❌ Cannot read $file, continuing..."
        return 1
    }

    local related
    related=$(get_related_context "$file") || true

    local prompt
    prompt="You are continuously improving a factbase knowledge base, one entity at a time.
Use factbase MCP tools — call get_authoring_guide if you need formatting rules.

ENTITY FILE: $file
ENTITY ID: $fb_id

=== CURRENT DOCUMENT CONTENT ===
$content
=== END DOCUMENT CONTENT ==="

    if [[ -n "$related" ]]; then
        prompt+="

=== MENTIONS IN OTHER DOCUMENTS ===
$related
=== END MENTIONS ==="
    fi

    prompt+='

STEPS — work through in order, skip any that do not apply:

1. RESOLVE REVIEW QUESTIONS:
   Call get_review_queue(doc_id='"'"''"$fb_id"''"'"') — if there are open questions, answer them.

   Patterns learned from resolving thousands of these:
   - CONFLICT (chronological overlaps): Boundary-year overlaps in sequential reigns or periods
     are NOT conflicts (date granularity artifact). Concurrent roles (e.g., ruler + military
     commander, pharaoh + high priest) are both true simultaneously. Approximate dates that
     overlap by a few years reflect scholarly uncertainty, not contradiction.
   - AMBIGUOUS (terms): Expand the term AND create or update a definitions/ file so the
     term is not flagged again. Do NOT just answer inline — the definitions file is what
     prevents recurrence. Check existing definitions files first with search_content.
     Common: BCE, CE, polis, satrapy, pharaoh, consul, tribune, cuneiform, stele, ziggurat.
   - TEMPORAL/STALE: BCE dates are written in text, not temporal tags. CE date ranges
     (@t[...]) that are closed are historical, not stale. For open @t[~...] tags, search
     for newer scholarship.
   - MISSING: Search with search_knowledge and search_content. If not found, defer.

   After answering, call apply_review_answers(doc_id='"'"''"$fb_id"''"'"'). Then re-read the
   file with get_entity and verify apply did not corrupt it (garbage footnotes, mangled title).
   If it did, fix with update_document.

2. IDENTITY & ORGANIZATION:
   For ruler documents: if the title is a single name, alias, or epithet (not a full name),
   prioritize finding their full or commonly known name. Search local sources, check mentions
   in other documents, and cross-reference with civilization docs.
   If you find the full name, update the document title with update_document.

   For any document: if the title or file location could be improved (e.g. a ruler doc is in
   the wrong civilization folder, or the filename does not match the title), use the organize MCP
   tool to rename/move it. Use organize(action='"'"'move'"'"', doc_id=..., to=...) to relocate
   or update_document(id=..., title=...) to fix the title.

3. ENRICH FROM OUTSIDE SOURCES:
   This is the most important step. Use web_search to find high-quality information about
   this entity from scholarly and encyclopedic sources. Search for:
   - The entity name + "archaeology" or "ancient history"
   - Key events, dates, or relationships mentioned in the document
   - Recent archaeological discoveries or revised scholarly consensus
   Prefer peer-reviewed sources, university publications, museum databases, and established
   encyclopedias. Add any new facts not already present, with source citations, following
   factbase authoring conventions. Do NOT add speculative or poorly-sourced claims.
'

    # The feature-request step needs a credential; include it only when one is
    # provided through the environment instead of hard-coding it here.
    if [[ -n "${VIKUNJA_TOKEN:-}" ]]; then
        prompt+='
4. IMPROVEMENT IDEAS:
   If you notice friction or gaps in factbase tools, file a Vikunja feature request:
   curl -s -X PUT "https://vikunja.home.everyonce.com/api/v1/projects/2/tasks" \
     -H "Authorization: Bearer '"$VIKUNJA_TOKEN"'" \
     -H "Content-Type: application/json" \
     -d '"'"'{"title":"[factbase][feature] <summary>","description":"<details>","priority":2}'"'"'
   Only file genuinely useful improvements, not duplicates.
'
    fi

    prompt+='
RULES:
- Use update_document to edit — be surgical, change only what needs changing
- If nothing needs changing, say so and move on

IMPORTANT: When finished, output exactly one line:
<action_summary>status: STATUS | '"$title"' | changes: DESCRIPTION</action_summary>
Status values: UPDATED (made changes), NO_CHANGE (nothing to do), ERROR (something failed)'

    local output
    if ! output=$(kiro-cli chat --trust-all-tools --no-interactive --model "$MODEL" \
        "$prompt" 2>&1); then
        # Keep whatever the agent printed so the failure can be diagnosed.
        if [[ -n "$output" ]]; then
            echo "$output"
        fi
        log "❌ kiro-cli error for $title, continuing..."
        return 1
    fi

    echo "$output"

    local summary
    summary=$(grep -oP '(?<=<action_summary>).*(?=</action_summary>)' <<<"$output" | tail -1) || true
    [[ -n "$summary" ]] || summary="status: UNKNOWN | $title | no summary returned"
    # Emit the summary WITH its tags as the last line: the caller extracts the
    # last tagged line, so an untagged fallback would never be seen.
    echo "<action_summary>${summary}</action_summary>"
}

# ─── Main entity processing dispatcher ───
#######################################
# Process one queue entry: mechanical cleanup, then (if needed) the agent,
# then commit, history logging and state bookkeeping.
# Globals:   DOCS_DIR, SKIP_UNCHANGED (read); ACTION_LOG (appended to);
#            cwd is changed to DOCS_DIR
# Arguments: $1 - entity id
#            $2 - file path relative to DOCS_DIR
#            $3 - open review question count
#            $4 - file mtime (epoch seconds)
#            $5 - last-processed time (epoch seconds, 0 = never)
#            $6 - garbage footnote count (only logged)
#            $7 - 1 if the ruler title looks incomplete, else 0
# Returns:   0 if the document was updated (by cleanup or by the agent),
#            1 otherwise
#######################################
process_entity() {
    local fb_id="$1"
    local file="$2"
    local review_count="$3"
    local mtime="$4"
    local last_processed="$5"
    local garbage_count="$6"
    local incomplete_name="${7:-0}"

    cd "$DOCS_DIR" || { log "❌ Cannot enter $DOCS_DIR"; return 1; }

    local title
    title=$(grep '^# ' "$file" | head -1 | sed 's/^# //' | sed 's/ @t\[.*//;s/ \[\^.*//') || true

    log "━━━ [$title] ($fb_id) reviews=$review_count garbage=$garbage_count ━━━"

    local start_time
    start_time=$(date +%s)
    local status="NO_CHANGE"
    local summary=""

    # Phase 1: Bash cleanup (milliseconds, no agent)
    if bash_cleanup "$file"; then
        log "  🧹 Bash cleanup applied"
        status="UPDATED"
        # Cleanup may have removed the review section; recount.
        review_count=$(grep -c '^\- \[ \] `@q\[' "$file" 2>/dev/null) || true
    fi

    # Phase 2: Decide if agent is needed
    local needs_agent=true
    if [[ "$incomplete_name" -eq 1 ]]; then
        log "  👤 Incomplete name (ruler doc) → agent needed to resolve identity"
    elif [[ "$review_count" -gt 0 ]]; then
        log "  📋 $review_count review questions → agent needed"
    elif [[ "$SKIP_UNCHANGED" == true && "$last_processed" -gt 0 && "$mtime" -le "$last_processed" ]]; then
        needs_agent=false
        log "  ⏭️  No questions, not modified since last pass → skipping (--skip-unchanged)"
    else
        log "  🔍 Enrichment + review pass"
    fi

    local agent_failed=false
    if [[ "$needs_agent" == true ]]; then
        local agent_output
        if ! agent_output=$(process_entity_agent "$fb_id" "$file" "$title"); then
            agent_failed=true
        fi
        echo "$agent_output"

        summary=$(grep -oP '(?<=<action_summary>).*(?=</action_summary>)' <<<"$agent_output" | tail -1) || true
        local agent_status
        agent_status=$(grep -oP '^status: \K[A-Z_]+' <<<"$summary" || echo "UNKNOWN")
        if [[ "$agent_status" == "UPDATED" ]]; then
            status="UPDATED"
        fi
    fi

    local end_time
    end_time=$(date +%s)
    local duration=$((end_time - start_time))

    if [[ "$status" == "UPDATED" ]]; then
        do_commit "improve: $title"
    fi

    # Describe what actually happened; "bash-only pass" is only accurate when
    # the agent was not run at all.
    if [[ -z "$summary" ]]; then
        if [[ "$agent_failed" == true ]]; then
            summary="status: ERROR | $title | agent invocation failed"
        elif [[ "$needs_agent" == true ]]; then
            summary="status: UNKNOWN | $title | no summary returned"
        else
            summary="status: $status | $title | bash-only pass"
        fi
    fi

    {
        echo "[$(date -Iseconds)] $fb_id | $title"
        echo "  $summary"
        echo "  duration: ${duration}s"
    } >> "$ACTION_LOG"

    # Recording a failed run as processed would let --skip-unchanged skip an
    # entity that was never actually handled; leave it unmarked so the next
    # pass retries it.
    if [[ "$agent_failed" == true ]]; then
        log "  ⚠️  Agent failed — not marking as processed, will retry next pass"
    else
        mark_processed "$fb_id"
    fi
    log "  Done (${duration}s) — $status"

    [[ "$status" == "UPDATED" ]]
}

# ═══════════════════════════════════════════
# DEEP CROSS-DOCUMENT SCAN (once per cycle)
# ═══════════════════════════════════════════
# Ask the agent to run the repository-wide deep check, print what it said,
# then commit whatever it changed.
# Globals: MODEL (read)
# Outputs: log lines and the agent's combined stdout/stderr
# Returns: 1 if kiro-cli exits non-zero (nothing is printed or committed in
#          that case apart from the log lines), otherwise 0
run_deep_scan() {
    local scan_prompt="Run check_repository with deep_check=true. Review any new issues found — answer what you can, defer what you cannot. Then commit."
    local scan_output

    log "🔬 Running deep cross-document validation scan..."

    if ! scan_output=$(kiro-cli chat --trust-all-tools --no-interactive --model "$MODEL" "$scan_prompt" 2>&1); then
        log "❌ Deep scan agent failed, continuing..."
        return 1
    fi

    echo "$scan_output"
    do_commit "deep scan: cross-document validation"
    log "✅ Deep scan complete"
}

# ═══════════════════════════════════════════
# MAIN LOOP
# ═══════════════════════════════════════════
log "🚀 Starting continuous improvement loop (priority=$PRIORITY, model=$MODEL, start=$START_AT, skip_unchanged=$SKIP_UNCHANGED)"
log "Docs dir: $DOCS_DIR"
log "State file: $STATE_FILE"
log "Press Ctrl+C to stop"

# Each pass: rebuild the queue, process every entity in order (skipping the
# first START_AT entries on the first pass only), then run the deep scan.
PASS=0
while true; do
    PASS=$((PASS + 1))
    log ""
    log "═══════════════════════════════════════════"
    log "  PASS $PASS — $(TZ='America/Chicago' date '+%Y-%m-%d %r') — priority=$PRIORITY"
    log "═══════════════════════════════════════════"

    QUEUE=$(build_queue)
    # On an empty queue grep -c prints "0" AND exits non-zero, so the previous
    # `|| echo 0` produced the two-line value "0<newline>0". `|| true` keeps
    # the single count and still satisfies `set -e`.
    TOTAL=$(grep -c . <<<"$QUEUE" || true)
    log "Queue: $TOTAL entities (starting at $((START_AT + 1)))"

    PROCESSED=0
    UPDATED=0
    SKIPPED=0
    POSITION=0

    while IFS=$'\t' read -r fb_id file review_count mtime last_processed garbage_count incomplete_name; do
        [[ -z "$fb_id" ]] && continue
        POSITION=$((POSITION + 1))

        if [[ $POSITION -le $START_AT ]]; then
            continue
        fi

        PROCESSED=$((PROCESSED + 1))
        log "[$POSITION/$TOTAL] Next up..."

        # stdin is detached so that nothing started in here (agent, git) can
        # read — and thereby swallow — the remaining queue rows that feed
        # this loop.
        if process_entity "$fb_id" "$file" "$review_count" "$mtime" "$last_processed" "$garbage_count" "$incomplete_name" </dev/null; then
            UPDATED=$((UPDATED + 1))
        fi

        sleep "$CYCLE_DELAY"
    done <<< "$QUEUE"

    log ""
    log "═══ Pass $PASS complete: $PROCESSED processed, $UPDATED updated ═══"

    # run_deep_scan returns 1 when the agent fails; as a bare command that
    # would terminate the whole loop under `set -e`, contradicting its own
    # "continuing..." message.
    run_deep_scan || log "⚠️  Deep scan did not complete; continuing with the next pass"

    START_AT=0
    log "Looping back to start..."
    sleep "$CYCLE_DELAY"
done