Files
factbase-ancient-history/.automate/continuous-improve.sh

507 lines
20 KiB
Bash
Executable File

#!/usr/bin/env bash
# continuous-improve.sh — Continuous enrichment and quality loop
#
# Each cycle: processes every entity (resolve reviews, enrich from outside
# sources), then runs a deep cross-document validation scan.
#
# Usage: .automate/continuous-improve.sh [options]
# --priority reviews|stale|random Queue ordering (default: reviews)
# --cycle-delay N Seconds between entities (default: 5)
# --model MODEL LLM model (default: claude-sonnet-4.6)
# --start N Skip first N entities in queue (resume)
# --skip-unchanged Skip entities unchanged since last pass
set -euo pipefail

# ─── Parse arguments ───
PRIORITY="reviews"
CYCLE_DELAY=5
MODEL="claude-sonnet-4.6"
START_AT=0
# NOTE(review): skipping is ON by default, which made the --skip-unchanged
# flag a no-op. The flag is kept for compatibility; --no-skip-unchanged is
# added so the behavior can actually be disabled.
SKIP_UNCHANGED=true

# Print usage and exit non-zero (reached on any unrecognized argument).
usage() {
  echo "Usage: $0 [--priority reviews|stale|random] [--cycle-delay N] [--model MODEL] [--start N] [--skip-unchanged] [--no-skip-unchanged]"
  exit 1
}

while [[ $# -gt 0 ]]; do
  case "$1" in
    --priority) PRIORITY="$2"; shift 2 ;;
    --cycle-delay) CYCLE_DELAY="$2"; shift 2 ;;
    --model) MODEL="$2"; shift 2 ;;
    --start) START_AT="$2"; shift 2 ;;
    --skip-unchanged) SKIP_UNCHANGED=true; shift ;;
    --no-skip-unchanged) SKIP_UNCHANGED=false; shift ;;
    *) usage ;;
  esac
done

SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
DOCS_DIR="$SCRIPT_DIR/.."                     # knowledge-base root (one level up)
STATE_FILE="$SCRIPT_DIR/.improve-state.tsv"   # "fb_id<TAB>epoch" per processed entity
ACTION_LOG="$SCRIPT_DIR/improve-history.log"  # human-readable per-entity history
# Mirror all stdout+stderr into a persistent log file.
exec > >(tee -a "$SCRIPT_DIR/continuous-improve.log") 2>&1
# Timestamped logger: prefixes every message with "[YYYY-MM-DD HH:MM:SS]".
log() {
  printf '[%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$*"
}
# Stage everything, commit with the given message, and push with up to three
# rebase-and-retry attempts. Best-effort by design: a failed push is logged
# but never fatal (returning non-zero here would kill the main loop under
# `set -e` when called from run_deep_scan).
do_commit() {
  local msg="$1"
  local attempt
  cd "$DOCS_DIR" || { log "❌ Cannot cd to $DOCS_DIR"; return 1; }
  git add -A
  # Only commit when staging actually changed something.
  if ! git diff --cached --quiet; then
    git commit -m "$msg"
    for attempt in 1 2 3; do
      if git push 2>/dev/null; then
        log "✅ Committed: $msg"
        return 0
      fi
      log "⚠️ Push attempt $attempt failed, rebasing..."
      # Guard the rebase: under `set -e` a rebase failure would otherwise
      # abort the whole script instead of allowing the next retry.
      git pull --rebase || log "⚠️ Rebase failed (attempt $attempt)"
    done
    log "❌ Push failed after 3 attempts"
  fi
}
# Emit the work queue: one TSV row per entity document, ordered by $PRIORITY.
# Columns: 1 fb_id, 2 file, 3 review_count, 4 mtime, 5 last_processed,
#          6 garbage_count, 7 incomplete_name.
build_queue() {
  cd "$DOCS_DIR"
  local tmpfile
  tmpfile=$(mktemp)
  find . -name '*.md' \
    -not -path './.git/*' \
    -not -path './.automate/*' \
    -not -path './.kiro/*' \
    -not -path './.factbase/*' \
    -not -path './_orphans.md' \
    -print0 | while IFS= read -r -d '' file; do
    local fb_id
    fb_id=$(grep -oP '(?<=factbase:)[a-f0-9]{6}' "$file" 2>/dev/null | head -1)
    [[ -z "$fb_id" ]] && continue
    # Open (unanswered) review questions in the doc.
    local review_count
    review_count=$(grep -c '^\- \[ \] `@q\[' "$file" 2>/dev/null) || true
    local mtime
    mtime=$(stat -c %Y "$file")
    # BUGFIX: state-file rows are TAB-separated (see mark_processed), but the
    # old lookup grepped for "id<space>", never matched, and silently forced
    # last_processed=0 — so --skip-unchanged never skipped anything. Match
    # the tab-delimited first field and take the most recent timestamp.
    local last_processed=0
    if [[ -f "$STATE_FILE" ]]; then
      last_processed=$(awk -F'\t' -v id="$fb_id" '$1 == id { ts = $2 } END { print ts }' "$STATE_FILE")
      [[ -z "$last_processed" ]] && last_processed=0
    fi
    # Heuristic count of garbage footnotes (review answers dumped as citations).
    local garbage_count
    garbage_count=$(grep -ciP '^\[\^.*\b(not a conflict|sequential|boundary overlap|not simultaneous|malformed tag|garbled|artifact|remove)\b' "$file" 2>/dev/null) || true
    # Flag ruler docs whose title is a single word (alias/epithet, no space).
    local incomplete_name=0
    local parent_dir
    parent_dir=$(basename "$(dirname "$file")")
    if [[ "$parent_dir" == "rulers" ]]; then
      local doc_title
      doc_title=$(grep '^# ' "$file" 2>/dev/null | head -1 | sed 's/^# //' | sed 's/ @t\[.*//;s/ \[\^.*//')
      if [[ -n "$doc_title" ]] && ! echo "$doc_title" | grep -q ' '; then
        incomplete_name=1
      fi
    fi
    printf '%s\t%s\t%s\t%s\t%s\t%s\t%s\n' \
      "$fb_id" "$file" "$review_count" "$mtime" "$last_processed" "$garbage_count" "$incomplete_name"
  done > "$tmpfile"
  # Ordering: incomplete-name rulers first, then per-priority tie-breakers.
  case "$PRIORITY" in
    reviews) sort -t$'\t' -k7,7rn -k3,3rn -k6,6rn -k5,5n "$tmpfile" ;;
    stale) sort -t$'\t' -k7,7rn -k5,5n -k3,3rn "$tmpfile" ;;
    random) shuf "$tmpfile" ;;
  esac
  rm -f "$tmpfile"
}
# Append "<fb_id><TAB><epoch-seconds>" to the state file; build_queue reads
# this back to compute last_processed for --skip-unchanged.
mark_processed() {
  # printf avoids echo -e's portability pitfalls and handles any id text.
  printf '%s\t%s\n' "$1" "$(date +%s)" >> "$STATE_FILE"
}
# ─── Bash-based mechanical cleanup (no agent needed) ───
# Fast, purely mechanical cleanup of one document (no agent involved).
# Returns 0 (success) iff the file was modified.
bash_cleanup() {
  local file="$1"
  local changed=false

  # Fix corrupted title: strip trailing " @t[...]" / " [^N]" from H1 lines.
  # (The old expression used a lazy ".+?", which POSIX/GNU ERE does not
  # support as non-greedy; instead strip from the FIRST marker occurrence.)
  if grep -qP '^# .+(\s+@t\[|\s+\[\^)' "$file" 2>/dev/null; then
    sed -i -E '/^# /s/[[:space:]]+(@t\[|\[\^).*$//' "$file"
    changed=true
  fi

  # Delete garbage footnotes (review answers dumped as source citations) in a
  # single pass — the old per-line `sed -i "${ln}d"` loop re-wrote the file
  # once per garbage line. GNU sed's `I` address flag gives case-insensitivity
  # to mirror grep -i.
  if grep -qiP '^\[\^\d+\]:.*\b(not a conflict|sequential role|boundary overlap|not simultaneous|malformed tag|garbled|artifact from previous|remove garbled)\b' "$file" 2>/dev/null; then
    sed -i -E '/^\[\^[0-9]+\]:.*\b(not a conflict|sequential role|boundary overlap|not simultaneous|malformed tag|garbled|artifact from previous|remove garbled)\b/Id' "$file"
    changed=true
  fi

  # Remove review answer artifact lines in the body.
  if grep -qP '^- Artifact from previous review application' "$file" 2>/dev/null; then
    sed -i '/^- Artifact from previous review application/d' "$file"
    changed=true
  fi

  # Remove empty Review Queue sections and factbase:review markers, but only
  # when the queue holds no unanswered questions.
  if grep -qP '^## Review Queue|<!-- factbase:review -->' "$file" 2>/dev/null; then
    local has_open
    has_open=$(grep -c '^\- \[ \] `@q\[' "$file" 2>/dev/null) || true
    if [[ "$has_open" -eq 0 ]]; then
      sed -i '/^## Review Queue$/d; /^<!-- factbase:review -->$/d' "$file"
      # Trim trailing blank lines left behind at end of file.
      sed -i -e :a -e '/^\n*$/{$d;N;ba' -e '}' "$file"
      changed=true
    fi
  fi

  [[ "$changed" == true ]]
}
# Print up to five short excerpts from OTHER documents that mention this
# document's title — cross-document context for the enrichment agent.
get_related_context() {
  local file="$1"
  local title
  title=$(grep '^# ' "$file" | head -1 | sed 's/^# //' | sed 's/ @t\[.*//;s/ \[\^.*//')
  [[ -z "$title" ]] && return
  cd "$DOCS_DIR"
  local related=""
  local match_file match_title excerpt
  while IFS= read -r match_file; do
    [[ "$match_file" == "$file" ]] && continue
    match_title=$(grep '^# ' "$match_file" 2>/dev/null | head -1 | sed 's/^# //')
    # BUGFIX: -F/-e — the title is literal text, not a regex. Titles with
    # metacharacters (e.g. "[", "(", ".") previously errored or mis-matched.
    excerpt=$(grep -iF -e "$title" "$match_file" 2>/dev/null | head -3)
    if [[ -n "$excerpt" ]]; then
      related+="--- From: ${match_title} (${match_file}) ---
${excerpt}
"
    fi
  done < <(grep -rlF --include='*.md' -e "$title" . \
    | grep -v '\.git\|\.automate\|\.kiro\|\.factbase\|_orphans' \
    | head -5)
  echo "$related"
}
# ─── Fetch existing Vikunja bug titles (for dedup) ───
# Fetch current Vikunja task titles for project 2 (used to dedup bug reports).
# SECURITY NOTE(review): the API token is hardcoded as a fallback below (and
# appears elsewhere in this script) — it should be rotated and supplied only
# via the VIKUNJA_TOKEN environment variable.
get_existing_bug_titles() {
  local token="${VIKUNJA_TOKEN:-tk_ff251f3d3512775c71913bc2f8ec0dabbf5016a8}"
  # NB: naive JSON scraping — adequate for titles without escaped quotes;
  # prefer jq if it can be assumed on the host.
  curl -s "https://vikunja.home.everyonce.com/api/v1/projects/2/tasks" \
    -H "Authorization: Bearer $token" \
    2>/dev/null | grep -oP '"title":"[^"]*"' | sed 's/"title":"//;s/"$//' || true
}
# ─── Agent-based processing (review questions + enrichment) ───
# Run one agent (kiro-cli) pass over a single entity: resolve open review
# questions, fix identity/organization, enrich from outside sources, and file
# factbase tool bugs. Echoes the full agent transcript, then the extracted
# one-line <action_summary> as the final line for the caller to parse.
# Args: $1 fb_id, $2 file path (relative to DOCS_DIR), $3 title,
#       $4 open-review-question count (default 0).
# Returns: 1 when kiro-cli itself fails; 0 otherwise.
process_entity_agent() {
local fb_id="$1"
local file="$2"
local title="$3"
local review_count="${4:-0}"
cd "$DOCS_DIR"
# Gather everything the agent needs up front: the document body, excerpts
# from other docs that mention this entity, and existing bug titles (dedup).
local content
content=$(cat "$file")
local related
related=$(get_related_context "$file")
local existing_bugs
existing_bugs=$(get_existing_bug_titles)
# The prompt embeds the whole document so the agent need not re-read it.
local prompt
prompt="You are continuously improving a factbase knowledge base, one entity at a time.
Use factbase MCP tools — call get_authoring_guide if you need formatting rules.
ENTITY FILE: $file
ENTITY ID: $fb_id
=== CURRENT DOCUMENT CONTENT ===
$content
=== END DOCUMENT CONTENT ==="
if [[ -n "$related" ]]; then
prompt+="
=== MENTIONS IN OTHER DOCUMENTS ===
$related
=== END MENTIONS ==="
fi
if [[ -n "$existing_bugs" ]]; then
prompt+="
=== EXISTING VIKUNJA BUG REPORTS (do NOT file duplicates) ===
$existing_bugs
=== END EXISTING BUGS ==="
fi
# Build review step conditionally
# (the '"'"'…'"'"' sequences splice shell variables into an otherwise
# single-quoted literal — do not reformat them).
local review_step=""
if [[ "$review_count" -gt 0 ]]; then
review_step='1. RESOLVE REVIEW QUESTIONS:
Call get_review_queue(doc_id='"'"''"$fb_id"''"'"') — if there are open (unanswered) questions, answer them.
Patterns learned from resolving thousands of these:
- CONFLICT (chronological overlaps): Boundary-year overlaps in sequential reigns or periods
are NOT conflicts (date granularity artifact). Concurrent roles (e.g., ruler + military
commander, pharaoh + high priest) are both true simultaneously. Approximate dates that
overlap by a few years reflect scholarly uncertainty, not contradiction.
- AMBIGUOUS (terms): Expand the term AND create or update a definitions/ file so the
term is not flagged again. Do NOT just answer inline — the definitions file is what
prevents recurrence. Check existing definitions files first with search_content.
Common: BCE, CE, polis, satrapy, pharaoh, consul, tribune, cuneiform, stele, ziggurat.
- TEMPORAL/STALE: BCE dates are written in text, not temporal tags. CE date ranges
(@t[...]) that are closed are historical, not stale. For open @t[~...] tags, search
for newer scholarship.
- MISSING: Search with search_knowledge and search_content. If not found, defer.
IMPORTANT: Only call apply_review_answers if get_review_queue shows unanswered > 0.
After applying, re-read the file with get_entity and verify apply did not corrupt it
(garbage footnotes, mangled title). If it did, fix with update_document.'
else
# No open questions: tell the agent explicitly so it does not waste a call.
review_step='1. REVIEW QUESTIONS: SKIP — the wrapper already confirmed 0 open review questions for this entity.'
fi
# Append the step-by-step instructions. Single-quoted: nothing expands except
# the spliced-in $review_step and $title.
prompt+='
STEPS — work through in order, skip any that do not apply:
'"$review_step"'
2. IDENTITY & ORGANIZATION:
For ruler documents: if the title is a single name, alias, or epithet (not a full name),
prioritize finding their full or commonly known name. Search local sources, check mentions
in other documents, and cross-reference with civilization docs.
If you find the full name, update the document title with update_document.
For any document: if the title or file location could be improved (e.g. a ruler doc is in
the wrong civilization folder, or the filename does not match the title), use the organize MCP
tool to rename/move it. Use organize(action='"'"'move'"'"', doc_id=..., to=...) to relocate
or update_document(id=..., title=...) to fix the title.
3. ENRICH FROM OUTSIDE SOURCES:
This is the most important step. Use web_search to find high-quality information about
this entity from scholarly and encyclopedic sources. Search for:
- The entity name + "archaeology" or "ancient history"
- Key events, dates, or relationships mentioned in the document
- Recent archaeological discoveries or revised scholarly consensus
Prefer peer-reviewed sources, university publications, museum databases, and established
encyclopedias. Add any new facts not already present, with source citations, following
factbase authoring conventions. Do NOT add speculative or poorly-sourced claims.
4. FACTBASE TOOL BUGS & FEATURE REQUESTS:
A key purpose of this improvement loop is to make factbase itself better. As you work
through steps 1-3, pay close attention to how the factbase tools behave. File a Vikunja
task for EACH of the following you encounter:
BUGS — any MCP tool error, unexpected result, silent failure, data corruption, or behavior
that contradicts the tool description. Use title prefix "[factbase]".
FEATURES — any friction, missing capability, awkward multi-step workaround, or gap that
slowed you down or limited what you could accomplish. Use title prefix "[factbase][feature]".
CRITICAL: Before filing ANY bug or feature request, check the EXISTING VIKUNJA BUG REPORTS
section above. If a similar issue is already filed, do NOT file a duplicate. Only file if
the issue is genuinely new.
CRITICAL: Factbase is domain-agnostic — it is used for many different knowledge bases, not
just this one. Your bug reports and feature requests MUST describe the issue in terms of
factbase tool behavior, not in terms of ancient history content. For example:
- GOOD: "apply_review_answers corrupts footnote references when document has >5 citations"
- BAD: "apply_review_answers broke the Ramesses II document"
- GOOD: "[feature] check_repository should detect duplicate entities across different folders"
- BAD: "[feature] factbase should know that Babylon appears in both civilizations/ and cities/"
Include in every task description:
- Which MCP tool was called (or which tool is missing/needed)
- What you expected to happen vs what actually happened
- Specific parameters or conditions that triggered the issue
- For features: the general use case and how it would help ANY factbase, not just this one
curl -s -X PUT "https://vikunja.home.everyonce.com/api/v1/projects/2/tasks" \
-H "Authorization: Bearer tk_ff251f3d3512775c71913bc2f8ec0dabbf5016a8" \
-H "Content-Type: application/json" \
-d '"'"'{"title":"[factbase] <summary>","description":"<detailed description>","priority":3}'"'"'
Priority: 1-2 for features/minor issues, 3 for normal bugs, 4-5 for data loss or blocking failures.
RULES:
- The document content is already provided above — do NOT call get_entity to re-read it
unless you just called apply_review_answers and need to verify the result
- Use update_document to edit — be surgical, change only what needs changing
- Do NOT run git add, git commit, or git push — the wrapper script handles all git operations
- If nothing needs changing, say so and move on
IMPORTANT: When finished, output exactly one line:
<action_summary>status: STATUS | '"$title"' | changes: DESCRIPTION</action_summary>
Status values: UPDATED (made changes), NO_CHANGE (nothing to do), ERROR (something failed)'
# Invoke the agent; on a non-zero exit, log and give up on this entity.
# NOTE(review): the prompt above embeds a hardcoded Vikunja API token —
# consider rotating it and injecting via environment instead.
local output
output=$(kiro-cli chat --trust-all-tools --no-interactive --model "$MODEL" \
"$prompt" 2>&1) || {
log "❌ kiro-cli error for $title, continuing..."
return 1
}
# Full transcript first (goes into the tee'd log via the caller)...
echo "$output"
# ...then the machine-readable summary as the final line; callers re-extract
# it with the same regex, so a missing tag degrades to a synthesized line.
local summary
summary=$(echo "$output" | grep -oP '(?<=<action_summary>).*(?=</action_summary>)' | tail -1)
[[ -z "$summary" ]] && summary="status: UNKNOWN | $title | no summary returned"
echo "$summary"
}
# ─── Main entity processing dispatcher ───
# Dispatch one entity: cheap mechanical cleanup first, then (when warranted)
# the agent pass, then commit, history logging, and state bookkeeping.
# Args mirror one queue row: fb_id, file, review_count, mtime,
# last_processed, garbage_count, incomplete_name.
# Returns 0 when the document was updated, 1 otherwise.
process_entity() {
  local fb_id="$1" file="$2" review_count="$3" mtime="$4"
  local last_processed="$5" garbage_count="$6" incomplete_name="$7"
  cd "$DOCS_DIR"

  # Title = first H1, with trailing @t[...] / [^N] markers stripped.
  local title
  title=$(grep '^# ' "$file" | head -1 | sed 's/^# //' | sed 's/ @t\[.*//;s/ \[\^.*//')
  log "━━━ [$title] ($fb_id) reviews=$review_count garbage=$garbage_count ━━━"

  local began status summary
  began=$(date +%s)
  status="NO_CHANGE"
  summary=""

  # Phase 1: mechanical cleanup (milliseconds, no agent).
  if bash_cleanup "$file"; then
    log " 🧹 Bash cleanup applied"
    status="UPDATED"
    # Recount after cleanup — it may have removed review-queue lines.
    review_count=$(grep -c '^\- \[ \] `@q\[' "$file" 2>/dev/null) || true
    garbage_count=0
  fi

  # Phase 2: decide whether the (slow) agent pass is warranted.
  local needs_agent=true
  if [[ "${incomplete_name:-0}" -eq 1 ]]; then
    log " 👤 Incomplete name (ruler doc) → agent needed to resolve identity"
  elif [[ "$review_count" -gt 0 ]]; then
    log " 📋 $review_count review questions → agent needed"
  elif [[ "$SKIP_UNCHANGED" == true && "$last_processed" -gt 0 && "$mtime" -le "$last_processed" ]]; then
    needs_agent=false
    log " ⏭️ No questions, not modified since last pass → skipping (--skip-unchanged)"
  else
    log " 🔍 Enrichment + review pass"
  fi

  if [[ "$needs_agent" == true ]]; then
    local agent_output agent_status
    agent_output=$(process_entity_agent "$fb_id" "$file" "$title" "$review_count")
    echo "$agent_output"
    # The summary is the agent's final <action_summary> line.
    summary=$(echo "$agent_output" | grep -oP '(?<=<action_summary>).*(?=</action_summary>)' | tail -1)
    agent_status=$(echo "$summary" | grep -oP '^status: \K[A-Z_]+' || echo "UNKNOWN")
    if [[ "$agent_status" == "UPDATED" ]]; then
      status="UPDATED"
    fi
  fi

  local elapsed
  elapsed=$(( $(date +%s) - began ))

  if [[ "$status" == "UPDATED" ]]; then
    do_commit "improve: $title"
  fi

  [[ -z "$summary" ]] && summary="status: $status | $title | bash-only pass"
  {
    echo "[$(date -Iseconds)] $fb_id | $title"
    echo " $summary"
    echo " duration: ${elapsed}s"
  } >> "$ACTION_LOG"

  mark_processed "$fb_id"
  log " Done (${elapsed}s) — $status"

  if [[ "$status" == "UPDATED" ]]; then
    return 0
  fi
  return 1
}
# ═══════════════════════════════════════════
# DEEP CROSS-DOCUMENT SCAN (once per cycle)
# ═══════════════════════════════════════════
# Once per full pass: ask the agent to run a repository-wide deep validation
# scan (check_repository with deep_check=true) and commit whatever it fixes.
# Returns 1 (non-fatal to callers in condition context) if kiro-cli fails.
run_deep_scan() {
log "🔬 Running deep cross-document validation scan..."
local output
# NOTE(review): the prompt below embeds a hardcoded Vikunja API token —
# consider rotating it and injecting it via environment instead.
output=$(kiro-cli chat --trust-all-tools --no-interactive --model "$MODEL" \
"Run check_repository with deep_check=true. Review any new issues found — answer what you can, defer what you cannot. If any factbase tool behaves unexpectedly during this process, file a bug to Vikunja (project 2, Authorization: Bearer tk_ff251f3d3512775c71913bc2f8ec0dabbf5016a8). Describe issues in domain-agnostic terms — factbase is used for many knowledge bases, not just this one." 2>&1) || {
log "❌ Deep scan agent failed, continuing..."
return 1
}
# Stream the full agent transcript into the tee'd script log.
echo "$output"
do_commit "deep scan: cross-document validation"
log "✅ Deep scan complete"
}
# ═══════════════════════════════════════════
# MAIN LOOP — runs forever: one full queue pass, then a deep scan, repeat.
# ═══════════════════════════════════════════
log "🚀 Starting continuous improvement loop (priority=$PRIORITY, model=$MODEL, start=$START_AT, skip_unchanged=$SKIP_UNCHANGED)"
log "Docs dir: $DOCS_DIR"
log "State file: $STATE_FILE"
log "Press Ctrl+C to stop"
PASS=0
while true; do
  PASS=$((PASS + 1))
  log ""
  log "═══════════════════════════════════════════"
  # BUGFIX: separator between pass number and timestamp was missing
  # (rendered as "PASS 12026-02-10 ...").
  log " PASS $PASS — $(TZ='America/Chicago' date '+%Y-%m-%d %r') — priority=$PRIORITY"
  log "═══════════════════════════════════════════"
  QUEUE=$(build_queue)
  # BUGFIX: `grep -c . || echo 0` yielded "0\n0" for an empty queue, because
  # grep -c already prints 0 before exiting non-zero. `|| true` keeps set -e
  # satisfied without duplicating the count.
  TOTAL=$(printf '%s\n' "$QUEUE" | grep -c . || true)
  log "Queue: $TOTAL entities (starting at $((START_AT + 1)))"
  PROCESSED=0
  UPDATED=0
  SKIPPED=0   # NOTE(review): currently unused; kept for future reporting
  POSITION=0
  while IFS=$'\t' read -r fb_id file review_count mtime last_processed garbage_count incomplete_name; do
    [[ -z "$fb_id" ]] && continue
    POSITION=$((POSITION + 1))
    # Honor --start N (first pass only — reset to 0 below).
    if [[ $POSITION -le $START_AT ]]; then
      continue
    fi
    PROCESSED=$((PROCESSED + 1))
    log "[$POSITION/$TOTAL] Next up..."
    if process_entity "$fb_id" "$file" "$review_count" "$mtime" "$last_processed" "$garbage_count" "$incomplete_name"; then
      UPDATED=$((UPDATED + 1))
    fi
    sleep "$CYCLE_DELAY"
  done <<< "$QUEUE"
  log ""
  log "═══ Pass $PASS complete: $PROCESSED processed, $UPDATED updated ═══"
  run_deep_scan
  START_AT=0
  log "Looping back to start..."
  sleep "$CYCLE_DELAY"
done