"""Pass 1 of Q-factor categorization: discover clusters.

Modes:
  --discover    Read descriptions, call Claude, write proposed clusters to JSON.
  --load FILE   Load reviewed clusters from JSON into cecl_qfactor_categories.

Example:
  python cecl_qfactor_cluster.py --shortqtr=1q26 --discover --out clusters.json
  (edit clusters.json by hand to merge/rename)
  python cecl_qfactor_cluster.py --shortqtr=1q26 --load clusters.json
"""
from __future__ import annotations
import argparse
import json
import os
import sys

from anthropic import Anthropic

from cecl_db import connect, dedup_descriptions

# Model identifier passed as `model=` to the Anthropic Messages API in discover().
MODEL = "claude-opus-4-7"

# Instruction preamble for the clustering request. discover() appends the
# deduplicated descriptions to this text, one per line, so the prompt must end
# with the "one per line" lead-in and a trailing newline.
# The JSON schema described here is what discover() writes to disk; load()
# reads each cluster's "name" and "description" back from that file
# ("sample_phrases" is not used by load()).
CLUSTER_PROMPT = """You are analyzing free-form qualitative-factor (Q-factor)
descriptions that community banks enter when configuring their CECL model.
Each description is a short phrase or sentence explaining a risk adjustment
the bank has applied to a loan pool.

Your job: propose a coherent set of 10-25 theme clusters that capture the
substance of what banks are citing. Clusters should:
  - be mutually distinct (a given phrase should clearly belong to one)
  - use plain-English, bank-auditor-readable names (e.g. "CRE concentration",
    "Inflation and supply chain", "Local economic softness")
  - avoid overly narrow clusters that would each only contain 1-2 phrases
  - avoid overly broad clusters that mix unrelated risks

Return STRICT JSON only. No prose. No markdown fences. Schema:
{
  "clusters": [
    {
      "name": "string, <= 60 chars",
      "description": "one sentence describing the theme",
      "sample_phrases": ["up to 5 representative phrases from the input"]
    }
  ]
}

The descriptions are below, one per line:
"""


def _fetch_descriptions(shortqtr: str) -> list:
    """Return the raw description column for every row stored under *shortqtr*."""
    conn = connect()
    try:
        cur = conn.cursor()
        try:
            cur.execute(
                "SELECT description FROM cecl_benchmark_customfactor_text WHERE shortqtr = %s",
                (shortqtr,),
            )
            return [r[0] for r in cur.fetchall()]
        finally:
            cur.close()
    finally:
        # Release the connection even when the query raises.
        conn.close()


def _strip_code_fence(text: str) -> str:
    """Return *text* without a surrounding Markdown code fence, if it has one."""
    stripped = text.strip()
    if not stripped.startswith("```"):
        return stripped
    # Drop the opening fence line (it may carry a language tag such as "json").
    lines = stripped.splitlines()[1:]
    if lines and lines[-1].strip().startswith("```"):
        lines = lines[:-1]
    return "\n".join(lines).strip()


def _parse_clusters_json(text: str) -> dict:
    """Parse the model reply and check that it has a top-level "clusters" list.

    Raises:
        ValueError: if the reply is not valid JSON or lacks a "clusters" list.
    """
    try:
        parsed = json.loads(_strip_code_fence(text))
    except json.JSONDecodeError as e:
        raise ValueError(f"response not valid JSON: {e}") from e
    if not isinstance(parsed, dict) or not isinstance(parsed.get("clusters"), list):
        raise ValueError('response JSON has no top-level "clusters" list')
    return parsed


def discover(shortqtr: str, out_path: str) -> None:
    """Propose theme clusters for one quarter and write them to *out_path*.

    Reads the descriptions stored for *shortqtr*, deduplicates them, sends them
    to the model together with CLUSTER_PROMPT, and writes the parsed JSON reply
    to *out_path* for manual review.

    Args:
        shortqtr: Quarter key used to filter the description rows (e.g. "1q26").
        out_path: Destination path for the proposed-clusters JSON file.

    Exits with status 1 (message on stderr) when ANTHROPIC_API_KEY is unset,
    when no descriptions are found, or when the reply is not usable JSON.
    Nothing is written to *out_path* in any of those cases.
    """
    # Fail fast: check the credential before doing any database work.
    api_key = os.environ.get("ANTHROPIC_API_KEY")
    if not api_key:
        print("ERROR: ANTHROPIC_API_KEY is not set", file=sys.stderr)
        sys.exit(1)

    raw = _fetch_descriptions(shortqtr)
    print(f"Loaded {len(raw)} raw descriptions")

    deduped = dedup_descriptions(raw)
    print(f"Deduplicated to {len(deduped)} distinct phrases")
    if len(deduped) == 0:
        # Nothing to cluster; do not spend a model call on an empty prompt.
        print(f"ERROR: no descriptions found for shortqtr={shortqtr}", file=sys.stderr)
        sys.exit(1)

    client = Anthropic(api_key=api_key)
    prompt = CLUSTER_PROMPT + "\n".join(deduped)

    print(f"Calling {MODEL}...")
    resp = client.messages.create(
        model=MODEL,
        max_tokens=8000,
        messages=[{"role": "user", "content": [{"type": "text", "text": prompt, "cache_control": {"type": "ephemeral"}}]}],
    )

    # Concatenate every text block rather than assuming exactly one at index 0.
    text = "".join(
        block.text for block in resp.content if getattr(block, "type", None) == "text"
    ).strip()
    try:
        parsed = _parse_clusters_json(text)
    except ValueError as e:
        print(f"ERROR: {e}", file=sys.stderr)
        if getattr(resp, "stop_reason", None) == "max_tokens":
            # A reply cut off at the token limit is the likeliest cause of broken JSON.
            print("ERROR: reply was truncated at max_tokens", file=sys.stderr)
        print(text, file=sys.stderr)
        sys.exit(1)

    # The file is only written once the reply is known to have the expected shape.
    with open(out_path, "w", encoding="utf-8") as f:
        json.dump(parsed, f, indent=2)
    print(f"Wrote {len(parsed['clusters'])} proposed clusters to {out_path}")
    print("REVIEW + EDIT THIS FILE, THEN RUN WITH --load")


def _validated_clusters(data: object, path: str) -> list:
    """Return the "clusters" list from *data* after checking its shape.

    Raises:
        ValueError: if *data* is not an object with a "clusters" list, or if
            any cluster is not an object with a non-empty string "name".
    """
    if not isinstance(data, dict) or not isinstance(data.get("clusters"), list):
        raise ValueError(f'{path}: expected a JSON object with a "clusters" list')
    clusters = data["clusters"]
    for i, c in enumerate(clusters):
        if not isinstance(c, dict):
            raise ValueError(f"{path}: clusters[{i}] is not a JSON object")
        name = c.get("name")
        if not isinstance(name, str) or not name.strip():
            raise ValueError(f'{path}: clusters[{i}] has no non-empty "name"')
    return clusters


def load(shortqtr: str, path: str) -> None:
    """Upsert the reviewed clusters in *path* into cecl_qfactor_categories.

    Each cluster's "name" and optional "description" are inserted under
    *shortqtr*; on a duplicate key the description and updated_at are updated.
    All rows are committed together after the last insert.

    Args:
        shortqtr: Quarter key stored alongside every cluster row.
        path: Path to the reviewed-clusters JSON file.

    Raises:
        ValueError: if the file is not valid JSON or does not have the
            expected shape. This is checked before any database connection
            is opened.
    """
    # The file is edited by hand, so read it as UTF-8 regardless of locale.
    with open(path, encoding="utf-8") as f:
        data = json.load(f)
    clusters = _validated_clusters(data, path)

    conn = connect()
    try:
        cur = conn.cursor()
        try:
            for c in clusters:
                cur.execute(
                    """INSERT INTO cecl_qfactor_categories (shortqtr, name, description)
                       VALUES (%s, %s, %s)
                       ON DUPLICATE KEY UPDATE description = VALUES(description),
                                                updated_at = CURRENT_TIMESTAMP""",
                    (shortqtr, c["name"], c.get("description")),
                )
            conn.commit()
        finally:
            cur.close()
    finally:
        # Close the connection even when an insert fails part-way through.
        conn.close()
    print(f"Loaded {len(clusters)} clusters for shortqtr={shortqtr}")


def main():
    """Parse the command line and run either discovery or loading.

    --shortqtr is always required. Exactly one of --discover (a flag) or
    --load FILE must be given. --out names the file written by --discover
    and defaults to "clusters.json".
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--shortqtr", required=True)
    mode = parser.add_mutually_exclusive_group(required=True)
    mode.add_argument("--discover", action="store_true")
    mode.add_argument("--load")
    parser.add_argument("--out", default="clusters.json")
    opts = parser.parse_args()

    # The group is required and exclusive, so "not --discover" means --load FILE.
    if not opts.discover:
        load(opts.shortqtr, opts.load)
        return
    discover(opts.shortqtr, opts.out)


if __name__ == "__main__":
    main()
