Files
factbase-ancient-history/.automate/continuous-improve.sh

450 lines
16 KiB
Bash
Executable File

#!/usr/bin/env bash
# continuous-improve.sh — Continuous enrichment and quality loop
#
# Each cycle: processes every entity (resolve reviews, enrich from outside
# sources), then runs a deep cross-document validation scan.
#
# Usage: .automate/continuous-improve.sh [options]
#   --priority reviews|stale|random   Queue ordering (default: reviews)
#   --cycle-delay N                   Seconds between entities (default: 5)
#   --model MODEL                     LLM model (default: claude-sonnet-4.6)
#   --start N                         Skip first N entities in queue (resume; first pass only)
#   --skip-unchanged                  Skip entities unchanged since last pass
set -euo pipefail

# ─── Parse arguments ───
# Defaults; each one can be overridden by the matching command-line flag.
PRIORITY="reviews"
CYCLE_DELAY=5
MODEL="claude-sonnet-4.6"
START_AT=0
SKIP_UNCHANGED=false

# Print the usage line to stderr and exit with status 1.
usage() {
  echo "Usage: $0 [--priority reviews|stale|random] [--cycle-delay N] [--model MODEL] [--start N] [--skip-unchanged]" >&2
  exit 1
}

# Parse command-line flags into PRIORITY, CYCLE_DELAY, MODEL, START_AT and
# SKIP_UNCHANGED, then validate them.
# Arguments: the script's command-line arguments.
# Exits (via usage) on an unknown flag, a value flag with no value, an
# unsupported --priority, or a non-integer --start.
parse_args() {
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --priority | --cycle-delay | --model | --start)
        # Without this check a trailing value flag trips `set -u` on "$2"
        # and the user only sees an "unbound variable" error.
        [[ $# -ge 2 ]] || usage
        case "$1" in
          --priority) PRIORITY="$2" ;;
          --cycle-delay) CYCLE_DELAY="$2" ;;
          --model) MODEL="$2" ;;
          --start) START_AT="$2" ;;
        esac
        shift 2
        ;;
      --skip-unchanged)
        SKIP_UNCHANGED=true
        shift
        ;;
      *) usage ;;
    esac
  done

  # An unknown priority would otherwise produce an empty queue on every pass.
  case "$PRIORITY" in
    reviews | stale | random) ;;
    *) usage ;;
  esac

  # START_AT is compared numerically against the queue position later on.
  [[ "$START_AT" =~ ^[0-9]+$ ]] || usage
  # Force base 10 so a value such as "008" is not parsed as invalid octal.
  START_AT=$((10#$START_AT))
}

parse_args "$@"
# Absolute directory containing this script; every other path hangs off it.
SCRIPT_DIR=$(cd "$(dirname "$0")" && pwd)
# The docs tree is the parent of the script directory.
DOCS_DIR="${SCRIPT_DIR}/.."
# Per-entity action history, appended to by process_entity.
ACTION_LOG="${SCRIPT_DIR}/improve-history.log"
# Tab-separated "<id><TAB><epoch>" records of processed entities.
STATE_FILE="${SCRIPT_DIR}/.improve-state.tsv"
# From here on, stdout and stderr are both piped through tee, which appends
# them to continuous-improve.log while still passing them through.
exec > >(tee -a "${SCRIPT_DIR}/continuous-improve.log") 2>&1
# Write a timestamped message to stdout.
# Arguments: the message words (joined with spaces).
log() {
  local stamp
  stamp=$(date '+%Y-%m-%d %H:%M:%S')
  printf '[%s] %s\n' "$stamp" "$*"
}
# Stage everything under DOCS_DIR, commit it if anything is staged, and push.
# Globals:   DOCS_DIR (read); changes the working directory to DOCS_DIR
# Arguments: $1 - commit message
# Returns:   0 when nothing was staged, when the push succeeded, and also
#            after three failed push attempts (the failure is only logged,
#            so the improvement loop keeps running).
do_commit() {
  local msg="$1"
  local attempt

  cd "$DOCS_DIR"
  git add -A

  # `git diff --cached --quiet` succeeds when the index matches HEAD.
  if git diff --cached --quiet; then
    return 0
  fi

  git commit -m "$msg"
  for attempt in 1 2 3; do
    if git push 2>/dev/null; then
      log "✅ Committed: $msg"
      return 0
    fi
    log "⚠️ Push attempt $attempt failed, rebasing..."
    # A rebase conflict must not abort the caller under `set -e`, and the
    # repository must not be left mid-rebase for the next entity's commit.
    if ! git pull --rebase; then
      log "⚠️ Rebase failed, aborting it before retrying"
      git rebase --abort 2>/dev/null || true
    fi
  done
  log "❌ Push failed after 3 attempts"
}
# Emit the work queue on stdout, one entity per line, tab-separated:
#   fb_id, file, review_count, mtime, last_processed, garbage_count, incomplete_name
# Markdown files without a "factbase:<6 hex>" id are left out.
# Globals:   DOCS_DIR, STATE_FILE, PRIORITY (all read)
# Outputs:   the queue, ordered according to PRIORITY
build_queue() {
  cd "$DOCS_DIR"
  local tmpfile
  tmpfile=$(mktemp)

  find . -name '*.md' \
    -not -path './.git/*' \
    -not -path './.automate/*' \
    -not -path './.kiro/*' \
    -not -path './.factbase/*' \
    -not -path './_orphans.md' \
    -print0 | while IFS= read -r -d '' file; do
    local fb_id
    fb_id=$(grep -oP '(?<=factbase:)[a-f0-9]{6}' "$file" 2>/dev/null | head -1) || true
    if [[ -z "$fb_id" ]]; then
      continue
    fi

    # Open (unchecked) review questions.
    local review_count
    review_count=$(grep -c '^\- \[ \] `@q\[' "$file" 2>/dev/null) || true

    local mtime
    mtime=$(stat -c %Y "$file")

    # Latest recorded processing time. The state file is "<id><TAB><epoch>",
    # so the id must be anchored with a real tab; matching a space never hits.
    local last_processed=0
    if [[ -f "$STATE_FILE" ]]; then
      last_processed=$(grep "^${fb_id}"$'\t' "$STATE_FILE" 2>/dev/null | tail -1 | cut -f2) || true
      [[ -z "$last_processed" ]] && last_processed=0
    fi

    # Footnote-style lines that contain review-answer phrases.
    local garbage_count
    garbage_count=$(grep -ciP '^\[\^.*\b(not a conflict|sequential|boundary overlap|not simultaneous|malformed tag|garbled|artifact|remove)\b' "$file" 2>/dev/null) || true

    # Flag ruler docs with incomplete names (title has no space in it).
    local incomplete_name=0
    local parent_dir="${file%/*}"
    parent_dir="${parent_dir##*/}"
    if [[ "$parent_dir" == "rulers" ]]; then
      local doc_title
      doc_title=$(grep '^# ' "$file" 2>/dev/null | head -1 | sed 's/^# //' | sed 's/ @t\[.*//;s/ \[\^.*//')
      if [[ -n "$doc_title" && "$doc_title" != *' '* ]]; then
        incomplete_name=1
      fi
    fi

    # The consumer splits on tabs, and runs of tabs collapse into one
    # separator, so an empty field would shift every later column.
    printf '%s\t%s\t%s\t%s\t%s\t%s\t%s\n' \
      "$fb_id" "$file" "${review_count:-0}" "${mtime:-0}" \
      "$last_processed" "${garbage_count:-0}" "$incomplete_name"
  done > "$tmpfile"

  case "$PRIORITY" in
    reviews) sort -t$'\t' -k7,7rn -k3,3rn -k6,6rn -k5,5n "$tmpfile" ;;
    stale) sort -t$'\t' -k7,7rn -k5,5n -k3,3rn "$tmpfile" ;;
    random) shuf "$tmpfile" ;;
    *)
      # Fall back to the default ordering rather than emitting an empty queue.
      echo "build_queue: unknown priority '$PRIORITY', using 'reviews'" >&2
      sort -t$'\t' -k7,7rn -k3,3rn -k6,6rn -k5,5n "$tmpfile"
      ;;
  esac
  rm -f "$tmpfile"
}
# Append a "<id><TAB><current epoch seconds>" record to the state file.
# Globals:   STATE_FILE (appended to)
# Arguments: $1 - entity id
mark_processed() {
  local entity_id="$1"
  local now
  now=$(date +%s)
  echo -e "${entity_id}\t${now}" >> "$STATE_FILE"
}
# ─── Bash-based mechanical cleanup (no agent needed) ───
# Mechanical, in-place cleanup of one document (no agent involved).
# Arguments: $1 - path of the markdown file to clean
# Returns:   0 if any cleanup step was applied, non-zero otherwise
bash_cleanup() {
  local file="$1"
  local changed=false
  local garbage_re='^\[\^\d+\]:.*\b(not a conflict|sequential role|boundary overlap|not simultaneous|malformed tag|garbled|artifact from previous|remove garbled)\b'

  # Fix corrupted title: strip @t[...] and [^N] suffixes.
  # Two separate substitutions cut from the FIRST occurrence of each marker.
  # A single 's/^(# .+?)\s+(...)$/\1/' does not: '+?' is not lazy in sed's
  # ERE, so only the last suffix would be removed when both are present.
  if grep -qP '^# .+(\s+@t\[|\s+\[\^)' "$file" 2>/dev/null; then
    sed -i -E \
      -e '/^# ./{' \
      -e 's/\s+@t\[.*$//' \
      -e 's/\s+\[\^.*$//' \
      -e '}' "$file"
    changed=true
  fi

  # Delete garbage footnotes (review answers dumped as source citations).
  if grep -qiP "$garbage_re" "$file" 2>/dev/null; then
    # Line numbers in descending order, so each deletion leaves the
    # remaining (smaller) numbers valid.
    local lines ln
    lines=$(grep -niP "$garbage_re" "$file" | cut -d: -f1 | sort -rn)
    if [[ -n "$lines" ]]; then
      # Unquoted on purpose: one line number per word.
      for ln in $lines; do
        sed -i "${ln}d" "$file"
      done
      changed=true
    fi
  fi

  # Remove review answer artifact lines in body.
  if grep -qP '^- Artifact from previous review application' "$file" 2>/dev/null; then
    sed -i '/^- Artifact from previous review application/d' "$file"
    changed=true
  fi

  # Remove empty Review Queue sections and factbase:review markers.
  if grep -qP '^## Review Queue|<!-- factbase:review -->' "$file" 2>/dev/null; then
    # Only remove if the review queue has no actual unanswered questions.
    local has_open
    has_open=$(grep -c '^\- \[ \] `@q\[' "$file" 2>/dev/null) || true
    if [[ "$has_open" -eq 0 ]]; then
      sed -i '/^## Review Queue$/d; /^<!-- factbase:review -->$/d' "$file"
      # Drop trailing blank lines at end of file.
      sed -i -e :a -e '/^\n*$/{$d;N;ba' -e '}' "$file"
      changed=true
    fi
  fi

  [[ "$changed" == true ]]
}
# Print excerpts from up to five other documents that mention this
# document's title.
# Globals:   DOCS_DIR (read); changes the working directory to DOCS_DIR
# Arguments: $1 - path of the entity file, in the "./..." form that a
#                 search rooted at "." produces
# Outputs:   zero or more "--- From: <title> (<file>) ---" blocks, each
#            followed by up to three matching lines
get_related_context() {
  local file="$1"
  local title
  title=$(grep '^# ' "$file" | head -1 | sed 's/^# //' | sed 's/ @t\[.*//;s/ \[\^.*//')
  if [[ -z "$title" ]]; then
    return 0
  fi

  cd "$DOCS_DIR"
  local related=""
  local match_file match_title excerpt
  while IFS= read -r match_file; do
    match_title=$(grep '^# ' "$match_file" 2>/dev/null | head -1 | sed 's/^# //')
    # -F: the title is literal text, not a regex.
    excerpt=$(grep -iF -- "$title" "$match_file" 2>/dev/null | head -3)
    if [[ -n "$excerpt" ]]; then
      related+="--- From: ${match_title} (${match_file}) ---"$'\n'"${excerpt}"$'\n'
    fi
  done < <(
    # Exclusions are by exact directory/file name. An unanchored regex such
    # as '.git' also discards unrelated paths like ./legit.md.
    # The entity itself is filtered out before `head`, so it does not use
    # up one of the five slots.
    grep -rlF --include='*.md' \
      --exclude-dir=.git --exclude-dir=.automate \
      --exclude-dir=.kiro --exclude-dir=.factbase \
      --exclude='_orphans.md' \
      -- "$title" . 2>/dev/null \
      | grep -vxF -- "$file" \
      | head -5
  )
  echo "$related"
}
# ─── Agent-based processing (review questions + enrichment) ───
# Run the LLM agent on one entity: resolve review questions, fix identity
# and organization, enrich from outside sources.
# Globals:   DOCS_DIR, MODEL (read)
#            VIKUNJA_TOKEN (read, optional) - API token for filing feature
#            requests; when it is unset the "IMPROVEMENT IDEAS" step is
#            left out of the prompt, so no credential lives in this file.
# Arguments: $1 - entity id, $2 - entity file, $3 - entity title
# Outputs:   the agent's output, followed by one final
#            <action_summary>...</action_summary> line
# Returns:   1 if the kiro-cli invocation fails, 0 otherwise
process_entity_agent() {
  local fb_id="$1"
  local file="$2"
  local title="$3"
  cd "$DOCS_DIR"
  local content
  content=$(cat "$file")
  local related
  related=$(get_related_context "$file")
  local prompt
  prompt="You are continuously improving a factbase knowledge base, one entity at a time.
Use factbase MCP tools — call get_authoring_guide if you need formatting rules.
ENTITY FILE: $file
ENTITY ID: $fb_id
=== CURRENT DOCUMENT CONTENT ===
$content
=== END DOCUMENT CONTENT ==="
  if [[ -n "$related" ]]; then
    prompt+="
=== MENTIONS IN OTHER DOCUMENTS ===
$related
=== END MENTIONS ==="
  fi
  prompt+='
STEPS — work through in order, skip any that do not apply:
1. RESOLVE REVIEW QUESTIONS:
Call get_review_queue(doc_id='"'"''"$fb_id"''"'"') — if there are open questions, answer them.
Patterns learned from resolving thousands of these:
- CONFLICT (chronological overlaps): Boundary-year overlaps in sequential reigns or periods
are NOT conflicts (date granularity artifact). Concurrent roles (e.g., ruler + military
commander, pharaoh + high priest) are both true simultaneously. Approximate dates that
overlap by a few years reflect scholarly uncertainty, not contradiction.
- AMBIGUOUS (terms): Expand the term AND create or update a definitions/ file so the
term is not flagged again. Do NOT just answer inline — the definitions file is what
prevents recurrence. Check existing definitions files first with search_content.
Common: BCE, CE, polis, satrapy, pharaoh, consul, tribune, cuneiform, stele, ziggurat.
- TEMPORAL/STALE: BCE dates are written in text, not temporal tags. CE date ranges
(@t[...]) that are closed are historical, not stale. For open @t[~...] tags, search
for newer scholarship.
- MISSING: Search with search_knowledge and search_content. If not found, defer.
After answering, call apply_review_answers(doc_id='"'"''"$fb_id"''"'"'). Then re-read the
file with get_entity and verify apply did not corrupt it (garbage footnotes, mangled title).
If it did, fix with update_document.
2. IDENTITY & ORGANIZATION:
For ruler documents: if the title is a single name, alias, or epithet (not a full name),
prioritize finding their full or commonly known name. Search local sources, check mentions
in other documents, and cross-reference with civilization docs.
If you find the full name, update the document title with update_document.
For any document: if the title or file location could be improved (e.g. a ruler doc is in
the wrong civilization folder, or the filename does not match the title), use the organize MCP
tool to rename/move it. Use organize(action='"'"'move'"'"', doc_id=..., to=...) to relocate
or update_document(id=..., title=...) to fix the title.
3. ENRICH FROM OUTSIDE SOURCES:
This is the most important step. Use web_search to find high-quality information about
this entity from scholarly and encyclopedic sources. Search for:
- The entity name + "archaeology" or "ancient history"
- Key events, dates, or relationships mentioned in the document
- Recent archaeological discoveries or revised scholarly consensus
Prefer peer-reviewed sources, university publications, museum databases, and established
encyclopedias. Add any new facts not already present, with source citations, following
factbase authoring conventions. Do NOT add speculative or poorly-sourced claims.'
  # The API token comes from the environment instead of being committed
  # to the repository; without it the agent is not asked to file anything.
  if [[ -n "${VIKUNJA_TOKEN:-}" ]]; then
    prompt+='
4. IMPROVEMENT IDEAS:
If you notice friction or gaps in factbase tools, file a Vikunja feature request:
curl -s -X PUT "https://vikunja.home.everyonce.com/api/v1/projects/2/tasks" \
-H "Authorization: Bearer '"$VIKUNJA_TOKEN"'" \
-H "Content-Type: application/json" \
-d '"'"'{"title":"[factbase][feature] <summary>","description":"<details>","priority":2}'"'"'
Only file genuinely useful improvements, not duplicates.'
  fi
  prompt+='
RULES:
- Use update_document to edit — be surgical, change only what needs changing
- If nothing needs changing, say so and move on
IMPORTANT: When finished, output exactly one line:
<action_summary>status: STATUS | '"$title"' | changes: DESCRIPTION</action_summary>
Status values: UPDATED (made changes), NO_CHANGE (nothing to do), ERROR (something failed)'

  local output
  # stdin is detached so the agent can never consume the caller's input
  # (the main loop feeds its queue to a `while read`).
  if ! output=$(kiro-cli chat --trust-all-tools --no-interactive --model "$MODEL" \
    "$prompt" < /dev/null 2>&1); then
    log "❌ kiro-cli error for $title, continuing..."
    # Keep the failure details for diagnosis. They go to stderr so the
    # caller, which parses stdout for a summary, does not act on them.
    printf '%s\n' "$output" >&2
    return 1
  fi
  echo "$output"

  local summary
  summary=$(echo "$output" | grep -oP '(?<=<action_summary>).*(?=</action_summary>)' | tail -1) || true
  if [[ -z "$summary" ]]; then
    summary="status: UNKNOWN | $title | no summary returned"
  fi
  # Emitted with its tags so the caller's <action_summary> parser also picks
  # up the fallback summary.
  echo "<action_summary>$summary</action_summary>"
}
# ─── Main entity processing dispatcher ───
# Process one queue entry: mechanical cleanup, then (if needed) the agent,
# then commit, action-log entry and state update.
# Globals:   DOCS_DIR, SKIP_UNCHANGED, ACTION_LOG (read / appended to)
# Arguments: $1 fb_id, $2 file, $3 review_count, $4 mtime,
#            $5 last_processed, $6 garbage_count, $7 incomplete_name
# Returns:   0 if the entity was updated, 1 otherwise
process_entity() {
  local fb_id="$1"
  local file="$2"
  local review_count="$3"
  local mtime="$4"
  local last_processed="$5"
  local garbage_count="$6"
  local incomplete_name="$7"

  # Checked explicitly: this function is called from an `if`, where
  # `set -e` does not apply.
  cd "$DOCS_DIR" || return 1

  local title
  title=$(grep '^# ' "$file" | head -1 | sed 's/^# //' | sed 's/ @t\[.*//;s/ \[\^.*//')
  log "━━━ [$title] ($fb_id) reviews=$review_count garbage=$garbage_count ━━━"

  local start_time
  start_time=$(date +%s)
  local status="NO_CHANGE"
  local summary=""

  # Phase 1: Bash cleanup (no agent)
  if bash_cleanup "$file"; then
    log " 🧹 Bash cleanup applied"
    status="UPDATED"
    # Recount after cleanup
    review_count=$(grep -c '^\- \[ \] `@q\[' "$file" 2>/dev/null) || true
    garbage_count=0
  fi

  # Phase 2: Decide if agent is needed
  local needs_agent=true
  if [[ "${incomplete_name:-0}" -eq 1 ]]; then
    log " 👤 Incomplete name (ruler doc) → agent needed to resolve identity"
  elif [[ "$review_count" -gt 0 ]]; then
    log " 📋 $review_count review questions → agent needed"
  elif [[ "$SKIP_UNCHANGED" == true && "$last_processed" -gt 0 && "$mtime" -le "$last_processed" ]]; then
    needs_agent=false
    log " ⏭️ No questions, not modified since last pass → skipping (--skip-unchanged)"
  else
    log " 🔍 Enrichment + review pass"
  fi

  local agent_failed=false
  if [[ "$needs_agent" == true ]]; then
    local agent_output
    if ! agent_output=$(process_entity_agent "$fb_id" "$file" "$title"); then
      agent_failed=true
    fi
    echo "$agent_output"
    summary=$(echo "$agent_output" | grep -oP '(?<=<action_summary>).*(?=</action_summary>)' | tail -1) || true
    local agent_status
    agent_status=$(echo "$summary" | grep -oP '^status: \K[A-Z_]+' || echo "UNKNOWN")
    if [[ "$agent_status" == "UPDATED" ]]; then
      status="UPDATED"
    fi
  fi

  local end_time
  end_time=$(date +%s)
  local duration=$((end_time - start_time))

  if [[ "$status" == "UPDATED" ]]; then
    do_commit "improve: $title"
  fi

  # Fallback summary. "bash-only pass" is used only when the agent really
  # was skipped, so the action log does not hide failed or silent runs.
  if [[ -z "$summary" ]]; then
    if [[ "$agent_failed" == true ]]; then
      summary="status: ERROR | $title | agent invocation failed"
    elif [[ "$needs_agent" == true ]]; then
      summary="status: UNKNOWN | $title | no summary returned"
    else
      summary="status: $status | $title | bash-only pass"
    fi
  fi

  {
    echo "[$(date -Iseconds)] $fb_id | $title"
    echo " $summary"
    echo " duration: ${duration}s"
  } >> "$ACTION_LOG"

  # A failed agent run is not recorded as processed. Otherwise
  # --skip-unchanged would skip the entity on every later pass although it
  # was never actually enriched.
  if [[ "$agent_failed" != true ]]; then
    mark_processed "$fb_id"
  fi

  log " Done (${duration}s) — $status"
  if [[ "$status" == "UPDATED" ]]; then
    return 0
  fi
  return 1
}
# ═══════════════════════════════════════════
# DEEP CROSS-DOCUMENT SCAN (once per cycle)
# ═══════════════════════════════════════════
# Run the agent once over the whole repository (deep cross-document check)
# and commit whatever it changed.
# Globals:   DOCS_DIR, MODEL (read); changes the working directory to DOCS_DIR
# Outputs:   the agent's output
# Returns:   1 if DOCS_DIR cannot be entered or the agent invocation fails
run_deep_scan() {
  log "🔬 Running deep cross-document validation scan..."

  # Make sure the agent runs inside the docs tree. When no entity was
  # processed in this pass, the shell may still be in the directory the
  # script was started from.
  cd "$DOCS_DIR" || return 1

  local output
  if ! output=$(kiro-cli chat --trust-all-tools --no-interactive --model "$MODEL" \
    "Run check_repository with deep_check=true. Review any new issues found — answer what you can, defer what you cannot. Then commit." 2>&1); then
    log "❌ Deep scan agent failed, continuing..."
    return 1
  fi
  echo "$output"
  do_commit "deep scan: cross-document validation"
  log "✅ Deep scan complete"
}
# ═══════════════════════════════════════════
# MAIN LOOP
# ═══════════════════════════════════════════
log "🚀 Starting continuous improvement loop (priority=$PRIORITY, model=$MODEL, start=$START_AT, skip_unchanged=$SKIP_UNCHANGED)"
log "Docs dir: $DOCS_DIR"
log "State file: $STATE_FILE"
log "Press Ctrl+C to stop"

PASS=0
while true; do
  PASS=$((PASS + 1))
  log ""
  log "═══════════════════════════════════════════"
  log " PASS $PASS — $(TZ='America/Chicago' date '+%Y-%m-%d %r') — priority=$PRIORITY"
  log "═══════════════════════════════════════════"

  QUEUE=$(build_queue)
  # `grep -c` prints 0 AND exits non-zero on an empty queue. `|| true`
  # keeps the single "0"; `|| echo 0` would append a second one.
  TOTAL=$(grep -c . <<< "$QUEUE" || true)
  log "Queue: $TOTAL entities (starting at $((START_AT + 1)))"

  PROCESSED=0
  UPDATED=0
  POSITION=0
  # The queue is read from file descriptor 3, so nothing started inside the
  # loop body (agent CLI, git) can swallow the remaining entries from stdin.
  while IFS=$'\t' read -r -u 3 fb_id file review_count mtime last_processed garbage_count incomplete_name; do
    [[ -z "$fb_id" ]] && continue
    POSITION=$((POSITION + 1))
    # Resume support: skip the first START_AT entries.
    if [[ $POSITION -le $START_AT ]]; then
      continue
    fi
    PROCESSED=$((PROCESSED + 1))
    log "[$POSITION/$TOTAL] Next up..."
    if process_entity "$fb_id" "$file" "$review_count" "$mtime" "$last_processed" "$garbage_count" "$incomplete_name"; then
      UPDATED=$((UPDATED + 1))
    fi
    sleep "$CYCLE_DELAY"
  done 3<<< "$QUEUE"

  log ""
  log "═══ Pass $PASS complete: $PROCESSED processed, $UPDATED updated ═══"

  # A failed deep scan must not end the loop: under `set -e` a bare call
  # returning non-zero would terminate the script.
  run_deep_scan || log "⚠️ Deep scan did not complete, moving on to the next pass"

  # --start only applies to the first pass.
  START_AT=0
  log "Looping back to start..."
  sleep "$CYCLE_DELAY"
done