@danielscholl
Created February 9, 2026 21:40
AI Document Assistant
# /// script
# requires-python = ">=3.10"
# dependencies = [
#     "streamlit>=1.54",
#     "langchain-core>=1.2",
#     "langchain-classic>=1.0",
#     "langchain-openai>=1.1",
#     "langchain-chroma>=1.1",
#     "langchain-community>=0.4",
#     "langchain-text-splitters>=1.1",
#     "tiktoken>=0.12",
#     "pypdf>=6.0",
#     "watchdog>=6.0",
# ]
# ///
# BUSINESS SCIENCE UNIVERSITY
# PYTHON FOR GENERATIVE AI COURSE
# DYNAMIC RAG APPLICATION CHALLENGE
# ***
#
# Document Reference Assistant — A Dynamic RAG Agent
#
# WHY THIS MATTERS (Big Picture):
# This is a complete, single-file RAG application that lets you upload any PDF
# and immediately chat with it. It demonstrates how to build an AI agent that:
# 1. INGESTS a PDF — extracts text, splits into chunks, generates embeddings
# 2. INDEXES chunks — stores vectors in a Chroma database for similarity search
# 3. RETRIEVES context — finds the most relevant chunks for each user question
# 4. GENERATES answers — uses an LLM with retrieved context to produce cited responses
# 5. MANAGES conversation — maintains chat history for multi-turn dialogue
#
# Architecture:
#   Upload PDF → Extract Pages → Chunk Text → Embed & Store → Chat Loop
#                                                                 ↓
#                          User Question → Retrieve Chunks → LLM → Answer
#
# Key concepts covered:
# - Self-launching Streamlit app (no separate "streamlit run" command needed)
# - Preflight validation (API key + model availability checks)
# - PDF loading and token-aware text chunking
# - AI-powered metadata extraction (title, topics, suggested questions)
# - Vector embeddings and Chroma vector store
# - History-aware retrieval (reformulates questions using chat context)
# - RAG chain assembly with LangChain
# - Session persistence (resume previous document on reload)
# - Dynamic follow-up question generation
#
# Run with uv (no conda needed):
# uv run --script document-agent.py
# =============================================================================
# 0.0 SELF-LAUNCH BOOTSTRAP
# =============================================================================
# WHY? Normally you'd run "streamlit run app.py" in the terminal. This bootstrap
# lets you just run "uv run --script document-agent.py" — the script detects it's
# being run directly (not by Streamlit) and re-launches itself via Streamlit.
#
# HOW IT WORKS:
# 1. First run: __name__ == "__main__" and IN_STREAMLIT is not set
# 2. We do preflight checks (API key, model access)
# 3. We set IN_STREAMLIT=1 in the environment to prevent infinite recursion
# 4. We call "streamlit run <this_file>" as a subprocess
# 5. Streamlit runs this file again, but now IN_STREAMLIT=1, so we skip this block
# and fall through to the actual app code below
import os
import sys
import subprocess
if __name__ == "__main__" and os.environ.get("IN_STREAMLIT") != "1":
    # -- Preflight Checks -----------------------------------------------------
    # Before starting the app, verify we have a valid OpenAI API key and that
    # the required models are accessible. This catches configuration errors
    # early with clear messages instead of cryptic runtime failures.
    #
    # Required models:
    #   - text-embedding-ada-002 (embedding model for vectorizing document chunks and queries)
    #   - gpt-4.1-mini (metadata extraction, follow-up question generation)
    api_key = os.environ.get("OPENAI_API_KEY")
    if not api_key:
        print("ERROR: OPENAI_API_KEY environment variable is not set.")
        print("Export it before running: export OPENAI_API_KEY='sk-...'")
        raise SystemExit(1)
    try:
        from openai import OpenAI
        client = OpenAI(api_key=api_key)
        # client.models.list() calls the OpenAI API to get all models your key can access
        available = {m.id for m in client.models.list()}
        required = {"text-embedding-ada-002", "gpt-4.1-mini"}
        missing = required - available
        if missing:
            print(f"ERROR: Required model(s) not available: {', '.join(sorted(missing))}")
            print("Check your OpenAI plan/access and try again.")
            raise SystemExit(1)
        print("Preflight OK — API key valid, required models available.")
    except SystemExit:
        raise
    except Exception as e:
        print(f"ERROR: Failed to verify OpenAI API access — {e}")
        raise SystemExit(1)
    # -- Streamlit Theme Configuration ----------------------------------------
    # Create a .streamlit/config.toml file for the dark theme if it doesn't exist.
    # Streamlit reads this file automatically on startup.
    script_dir = os.path.dirname(os.path.abspath(__file__))
    st_config_dir = os.path.join(script_dir, ".streamlit")
    st_config_file = os.path.join(st_config_dir, "config.toml")
    if not os.path.exists(st_config_file):
        os.makedirs(st_config_dir, exist_ok=True)
        with open(st_config_file, "w") as f:
            f.write('[theme]\nbase="dark"\nprimaryColor="#d33682"\n'
                    'backgroundColor="#000000"\nsecondaryBackgroundColor="#586e75"\n'
                    'textColor="#fafafa"\nfont="sans serif"\n')
    # -- Launch Streamlit -----------------------------------------------------
    # subprocess.call() runs Streamlit as a child process and waits for it to exit.
    # We pass --server.headless=true so it doesn't try to open a browser automatically.
    # SystemExit propagates the child's exit code to the parent process.
    env = os.environ.copy()
    env["IN_STREAMLIT"] = "1"
    raise SystemExit(
        subprocess.call(
            [sys.executable, "-m", "streamlit", "run", __file__,
             "--server.port", "8501", "--server.headless", "true"],
            env=env,
            cwd=script_dir,
        )
    )
# =============================================================================
# 1.0 IMPORTS
# =============================================================================
# Everything below this point runs INSIDE Streamlit (the bootstrap above is skipped).
# Chroma: Open-source vector database. Stores document embeddings on disk and
# lets you do similarity search (find the most relevant documents for a query).
from langchain_chroma import Chroma
# ChatPromptTemplate: Defines structured prompts with system/human/AI message roles.
# MessagesPlaceholder: A slot in a prompt template where chat history gets injected.
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
# Document: LangChain's standard container — holds page_content (text) + metadata (dict).
from langchain_core.documents import Document
# ChatOpenAI: Wrapper around OpenAI's chat completion API (gpt-4, gpt-4o, etc.).
# OpenAIEmbeddings: Wrapper around OpenAI's embedding API — converts text into vectors
# (lists of ~1536 numbers) that capture semantic meaning. Similar text → similar vectors.
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
# create_history_aware_retriever: Wraps a retriever so it reformulates the user's
# question using chat history BEFORE searching. This way "Tell me more about that"
# becomes "Tell me more about [the topic from the previous answer]".
# create_retrieval_chain: Combines a retriever + an LLM chain into a single pipeline:
# question → retrieve docs → pass docs + question to LLM → answer.
from langchain_classic.chains import create_history_aware_retriever, create_retrieval_chain
# create_stuff_documents_chain: The simplest document combination strategy — "stuff"
# all retrieved documents into the prompt context at once (vs. map-reduce or refine).
# Works well when chunks are small enough to fit in the context window.
from langchain_classic.chains.combine_documents import create_stuff_documents_chain
# RunnableWithMessageHistory: Wraps any chain to automatically load/save chat history.
# Each invocation reads prior messages, appends the new exchange, and persists them.
from langchain_core.runnables.history import RunnableWithMessageHistory
# StreamlitChatMessageHistory: Stores chat messages in Streamlit's session_state.
# This means history survives re-runs (Streamlit re-executes the script on every
# interaction) but is lost when the browser tab closes.
from langchain_community.chat_message_histories import StreamlitChatMessageHistory
# PyPDFLoader: Reads a PDF file and returns a list of Document objects (one per page).
from langchain_community.document_loaders import PyPDFLoader
# RecursiveCharacterTextSplitter: Splits long documents into smaller chunks for embedding.
# "Recursive" means it tries splitting on paragraphs first, then sentences, then words —
# preserving natural boundaries. from_tiktoken_encoder() counts tokens (not characters)
# so chunks respect the embedding model's token limits.
from langchain_text_splitters import RecursiveCharacterTextSplitter
# Streamlit: The web framework that powers the UI. It re-runs this entire script from
# top to bottom on every user interaction (button click, text input, etc.).
# st.session_state persists data across these re-runs.
import streamlit as st
import os
import json
import re
import html as html_module # renamed to avoid collision with streamlit's html support
import tempfile
import shutil
import time
from pathlib import Path
# =============================================================================
# 2.0 CONFIGURATION
# =============================================================================
# Central place for all tunable parameters. Change these to experiment with
# different models, chunk sizes, or storage locations.
BASE_DIR = Path(__file__).parent
# Where the Chroma vector database is stored on disk.
# This directory contains the embedded chunks from the uploaded PDF.
VECTOR_DATABASE = str(BASE_DIR / "data" / "chroma_document.db")
# JSON file that persists document metadata between browser sessions.
# When you reload the page, the app reads this to skip re-uploading.
DOCUMENT_STATE_FILE = str(BASE_DIR / "data" / "document_state.json")
# EMBEDDING MODEL: text-embedding-ada-002
# - Outputs a 1536-dimensional vector for each text input
# - Used for BOTH indexing documents AND embedding user queries
# - CRITICAL: The same model must be used for indexing and querying,
# otherwise the vectors won't be in the same "space" and search fails
# - Resource: https://platform.openai.com/docs/models
EMBEDDING_MODEL = "text-embedding-ada-002"
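# Illustration only — the sketch below is NOT called anywhere in this app. It is a
# minimal example (assuming OPENAI_API_KEY is set) you can run from a REPL to see
# what an embedding actually is: ada-002 returns a 1536-dimensional unit-length
# vector, so the dot product of two vectors is their cosine similarity.
def _embedding_sketch():
    from openai import OpenAI  # available via the langchain-openai dependency
    client = OpenAI()
    texts = ["vector databases store embeddings", "Chroma persists document vectors"]
    vectors = [d.embedding for d in client.embeddings.create(model=EMBEDDING_MODEL, input=texts).data]
    # Dot product of unit-length vectors = cosine similarity (higher = more related)
    similarity = sum(a * b for a, b in zip(vectors[0], vectors[1]))
    return len(vectors[0]), similarity  # (1536, similarity score)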
# METADATA MODEL: Used for lightweight AI tasks (extracting document metadata,
# generating follow-up questions). A small, fast, cheap model works best here
# since these tasks don't need deep reasoning.
METADATA_MODEL = "gpt-4.1-mini"
# CHAT MODELS — Three tiers the user can select in the sidebar.
# Each tier offers a meaningful difference in quality/cost/speed:
#
# Standard models (accept temperature parameter):
# gpt-4o-mini — Cheapest & fastest. Good for simple Q&A. ~$0.15/$0.60 per 1M tokens.
# gpt-4.1-mini — Best balance of quality and cost. ~$0.40/$1.60 per 1M tokens. (DEFAULT)
# gpt-4.1 — Most capable standard model. ~$2.00/$8.00 per 1M tokens.
#
# Reasoning models (do NOT accept temperature — they self-regulate):
# o3-mini — Cheapest reasoning model. Good for basic analysis. ~$1.10/$4.40 per 1M tokens.
# o4-mini — Best balance for reasoning tasks. ~$1.10/$4.40 per 1M tokens. (DEFAULT)
# o3 — Deepest reasoning. Slow but thorough. ~$2.00/$8.00 per 1M tokens.
STANDARD_MODELS = ["gpt-4o-mini", "gpt-4.1-mini", "gpt-4.1"]
REASONING_MODELS = ["o3-mini", "o4-mini", "o3"]
# CHUNKING PARAMETERS:
# CHUNK_SIZE: Target size of each text chunk in tokens (not characters).
# Smaller chunks = more precise retrieval but less context per chunk.
# Larger chunks = more context but may include irrelevant text.
# 500 tokens is a good default (~375 words, ~1/3 of a page).
# CHUNK_OVERLAP: How many tokens overlap between consecutive chunks.
# This prevents information from being split across chunk boundaries.
# 50 tokens of overlap means the end of chunk N appears at the start of chunk N+1.
CHUNK_SIZE = 500
CHUNK_OVERLAP = 50
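# Illustration only — the sketch below is NOT called anywhere in this app. It shows
# why chunks are sized in tokens rather than characters: tiktoken (already a
# dependency above) maps a model name to its tokenizer, and chunk/context limits
# apply to the token count, not the character count.
def _token_count_sketch(text: str = "Retrieval-augmented generation grounds LLM answers in your own documents."):
    import tiktoken
    enc = tiktoken.encoding_for_model(EMBEDDING_MODEL)
    # The character count is typically several times larger than the token count.
    return {"characters": len(text), "tokens": len(enc.encode(text))}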
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
# =============================================================================
# 3.0 PAGE SETUP & STYLING
# =============================================================================
# Streamlit page configuration must be the first st.* call in the script.
# layout="wide" uses the full browser width instead of a narrow centered column.
st.set_page_config(
    page_title="Document Assistant",
    page_icon="docs",
    layout="wide",
    initial_sidebar_state="collapsed"
)
# Custom CSS injected via st.markdown with unsafe_allow_html=True.
# Streamlit's default styling is functional but generic — this CSS gives the app
# a polished, professional look with a dark-themed header, styled source cards, etc.
st.markdown("""
<style>
/* Layout */
.block-container { max-width: 1400px; padding-top: 1rem; }
/* Header bar — gradient background with accent border */
.app-header {
background: linear-gradient(135deg, #1a1a2e 0%, #16213e 50%, #0f3460 100%);
color: #e8e8e8;
padding: 20px 28px;
border-radius: 8px;
margin-bottom: 4px;
border-left: 4px solid #4a90d9;
}
.app-header h1 { margin: 0; font-size: 1.6rem; font-weight: 600; }
.app-header p { margin: 4px 0 0 0; opacity: 0.85; font-size: 0.9rem; }
/* Document status row — shows filename, chunk count, model */
.doc-status-row {
display: flex;
flex-wrap: wrap;
gap: 20px;
padding: 10px 16px;
background: #f0f4f8;
border-radius: 6px;
margin-bottom: 16px;
font-size: 0.82rem;
color: #495057;
border-left: 3px solid #4a90d9;
}
.doc-status-row .status-item { white-space: nowrap; }
.doc-status-row .status-label { color: #6c757d; margin-right: 4px; }
/* Question chips — clickable suggested/follow-up questions */
.chip-container { display: flex; flex-wrap: wrap; gap: 8px; margin: 8px 0 16px 0; }
.q-chip {
display: inline-block;
background: #e9ecef;
color: #343a40;
padding: 6px 14px;
border-radius: 20px;
font-size: 0.82rem;
cursor: pointer;
border: 1px solid #dee2e6;
transition: all 0.15s ease;
}
.q-chip:hover { background: #4a90d9; color: white; border-color: #4a90d9; }
/* Source card — displays retrieved chunk content */
.source-card {
background: #f8f9fa;
border-left: 3px solid #4a90d9;
border-radius: 6px;
padding: 10px 14px;
margin-bottom: 10px;
font-size: 0.82rem;
line-height: 1.5;
}
.source-card .source-meta {
color: #6c757d;
font-size: 0.75rem;
margin-bottom: 4px;
}
/* Processing log — terminal-style output during PDF indexing */
.process-log {
background: #1a1a2e;
color: #a8d8a8;
border-radius: 6px;
padding: 12px 16px;
font-family: 'Menlo', 'Consolas', monospace;
font-size: 0.78rem;
line-height: 1.8;
max-height: 180px;
overflow-y: auto;
}
/* Info card on upload page */
.info-card {
background: #f8f9fa;
border-radius: 8px;
padding: 14px 18px;
margin: 8px 0;
border-left: 3px solid #4a90d9;
font-size: 0.88rem;
}
/* Sidebar compact */
section[data-testid="stSidebar"] { width: 280px !important; }
section[data-testid="stSidebar"] .block-container { padding-top: 1rem; }
/* Footer */
.app-footer {
text-align: center; color: #6c757d; font-size: 0.75rem;
padding: 12px 0; border-top: 1px solid #dee2e6; margin-top: 24px;
}
/* Highlighted search terms in source cards */
.source-card mark {
background: #fff3cd;
padding: 1px 3px;
border-radius: 2px;
font-weight: 500;
}
/* Source summary bar — "Retrieved N chunks covering pages X, Y, Z" */
.source-summary {
background: #e8f4f8;
border-radius: 6px;
padding: 8px 12px;
margin-bottom: 12px;
font-size: 0.78rem;
color: #495057;
}
/* Low confidence warning */
.confidence-warning {
background: #fff3cd;
border-left: 3px solid #ffc107;
border-radius: 6px;
padding: 10px 14px;
margin: 8px 0;
font-size: 0.82rem;
color: #856404;
}
/* Hide default Streamlit branding */
#MainMenu {visibility: hidden;}
footer {visibility: hidden;}
</style>
""", unsafe_allow_html=True)
# =============================================================================
# 4.0 SESSION STATE
# =============================================================================
# Streamlit re-runs the entire script on every interaction. Session state is how
# we persist data across those re-runs. Think of it as a per-user dictionary that
# survives button clicks and text input but resets when the browser tab closes.
#
# DEFAULTS defines every key we use and its initial value. On each re-run, we
# only set keys that don't exist yet (preserving values from previous interactions).
DEFAULTS = {
    "document_ready": False,       # True once a PDF has been indexed
    "document_metadata": None,     # AI-extracted: title, topics, questions, etc.
    "uploaded_filename": None,     # Original filename of the uploaded PDF
    "uploaded_filesize": None,     # File size in bytes
    "chunk_count": None,           # Number of chunks in the vector store
    "page_count": None,            # Number of pages in the PDF
    "last_sources": [],            # Retrieved documents for the latest answer
    "last_query": None,            # Last user question (for highlighting terms)
    "chip_question": None,         # Set when user clicks a suggested question chip
    "followup_questions": [],      # AI-generated follow-up questions after each answer
}
for key, val in DEFAULTS.items():
    if key not in st.session_state:
        st.session_state[key] = val
# -- Session Restore ----------------------------------------------------------
# WHY? Without this, refreshing the browser would lose the loaded document and
# force re-uploading. We persist document metadata to a JSON file on disk (alongside
# the Chroma vector DB). On startup, if both exist, we restore session state so
# the user goes straight to the chat screen.
if (
    not st.session_state.document_ready
    and os.path.exists(VECTOR_DATABASE)
    and os.path.exists(DOCUMENT_STATE_FILE)
):
    try:
        with open(DOCUMENT_STATE_FILE) as f:
            saved = json.load(f)
        st.session_state.document_ready = True
        st.session_state.document_metadata = saved.get("document_metadata")
        st.session_state.uploaded_filename = saved.get("uploaded_filename")
        st.session_state.uploaded_filesize = saved.get("uploaded_filesize")
        st.session_state.chunk_count = saved.get("chunk_count")
        st.session_state.page_count = saved.get("page_count")
    except Exception:
        pass  # corrupt file — fall through to upload screen
# =============================================================================
# 5.0 CORE FUNCTIONS — RAG Pipeline
# =============================================================================
# These functions implement the RAG pipeline:
# get_embedding_function() → creates the embedding model (cached)
# get_vectorstore() → connects to the Chroma vector database
# extract_document_metadata() → uses AI to extract title, topics, questions from PDF
# process_uploaded_pdf() → full ingestion pipeline (load → chunk → embed → store)
# create_rag_chain() → assembles the retrieval + generation chain
# generate_followup_questions() → generates contextual follow-up suggestions
# highlight_terms() → highlights search terms in source text
# reset_document() → clears everything for a fresh upload
# * 5.1 Embedding Function (Cached)
# -----------------------------------
# @st.cache_resource means this function runs ONCE and the result is reused across
# all re-runs and all users. This avoids creating a new OpenAIEmbeddings object
# (and its HTTP connection pool) on every single Streamlit re-run.
#
# CRITICAL: The same embedding model must be used for BOTH indexing and querying.
@st.cache_resource(show_spinner=False)
def get_embedding_function():
    return OpenAIEmbeddings(model=EMBEDDING_MODEL)
# * 5.2 Vector Store Connection
# ------------------------------
# Connects to an existing Chroma database on disk. Does NOT create a new one —
# that happens in process_uploaded_pdf(). Returns None if no document is loaded.
def get_vectorstore():
    if not st.session_state.document_ready:
        return None
    return Chroma(
        persist_directory=VECTOR_DATABASE,
        embedding_function=get_embedding_function()
    )
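# Illustration only — the sketch below is NOT called anywhere in this app. It is a
# minimal example of what the retriever built later in create_rag_chain() does
# under the hood: Chroma embeds the query with the same model used at indexing
# time and returns the k closest chunks.
def _similarity_search_sketch(query: str, k: int = 4):
    store = get_vectorstore()
    if store is None:
        return []
    return store.similarity_search(query, k=k)  # list[Document], most similar first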
# * 5.3 AI-Powered Metadata Extraction
# --------------------------------------
# WHY? Instead of showing the raw filename as the app title, we use a small LLM
# to analyze the first few pages and extract structured metadata: a professional
# title, topics, suggested questions, and a welcome message. This makes the chat
# experience feel tailored to each document.
#
# HOW: We take the first ~4000 chars from the first 5 pages, send them to
# gpt-4.1-mini with a structured prompt, and parse the JSON response.
# If anything fails, we fall back to sensible defaults derived from the filename.
def extract_document_metadata(documents: list, filename: str) -> dict:
    sample_content = ""
    for doc in documents[:5]:
        sample_content += doc.page_content[:1000] + "\n\n"
    sample_content = sample_content[:4000]
    llm = ChatOpenAI(model=METADATA_MODEL, temperature=0.3)
    prompt = ChatPromptTemplate.from_template("""Analyze this document excerpt and extract metadata.
Document filename: {filename}
Content sample:
{content}
Return a JSON object with these exact keys:
{{
"title": "A concise, professional title for this document (max 50 chars)",
"subtitle": "A brief description of what this document covers (max 100 chars)",
"topics": ["topic1", "topic2", ...], // 8-15 key topics/concepts covered
"chapters": ["Chapter 1: Title", "Chapter 2: Title", ...], // Main sections/chapters if identifiable from TOC or headings. Empty list if none.
"example_questions": [
"Question 1 that a reader might ask about this content",
"Question 2...",
...
], // 6-8 diverse example questions
"welcome_message": "A brief welcome message for a chat assistant (max 150 chars)"
}}
Return ONLY valid JSON, no other text.""")
    try:
        response = llm.invoke(prompt.format(filename=filename, content=sample_content))
        # Strip markdown code fences if the LLM wraps the JSON in ```json ... ```
        json_str = response.content.strip()
        if json_str.startswith("```"):
            json_str = json_str.split("```")[1]
            if json_str.startswith("json"):
                json_str = json_str[4:]
        return json.loads(json_str)
    except Exception:
        # Fallback: derive basic metadata from the filename
        return {
            "title": filename.replace(".pdf", "").replace("_", " ").title(),
            "subtitle": "Document Reference Assistant",
            "topics": ["Overview", "Key Concepts", "Details", "Summary"],
            "example_questions": [
                "What is the main topic of this document?",
                "Can you summarize the key points?",
                "What are the important concepts covered?",
                "How does this relate to practical applications?"
            ],
            "welcome_message": f"I can answer questions about {filename}. What would you like to know?"
        }
# * 5.4 PDF Ingestion Pipeline
# ------------------------------
# This is the main ingestion function. It takes a raw uploaded PDF and produces
# a fully searchable vector store. The pipeline:
#
# 1. Save uploaded bytes to a temp file (PyPDFLoader needs a file path)
# 2. Extract text from every page → list of Document objects
# 3. AI metadata extraction (title, topics, questions)
# 4. Token-based chunking (split pages into ~500 token chunks)
# 5. Create a TOC document (synthetic document with document overview)
# 6. Generate embeddings via OpenAI and store in Chroma
# 7. Persist document state to disk for session restore
#
# The log_container and progress_bar provide real-time feedback in the UI.
def process_uploaded_pdf(uploaded_file, log_container, progress_bar) -> tuple[bool, str]:
    logs = []
    def log(msg, pct):
        logs.append(msg)
        log_container.markdown(
            '<div class="process-log">' + "<br>".join(f"&gt; {l}" for l in logs) + "</div>",
            unsafe_allow_html=True
        )
        progress_bar.progress(pct)
    try:
        # Step 1: Save to temp file
        log("Saving uploaded file...", 0.05)
        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
            tmp.write(uploaded_file.getvalue())
            tmp_path = tmp.name
        # Step 2: Extract text from PDF
        # PyPDFLoader reads each page into a Document with page_content and metadata (page number).
        log("Extracting text from PDF...", 0.15)
        loader = PyPDFLoader(tmp_path)
        documents = loader.load()
        if not documents:
            return False, "No content could be extracted from the PDF."
        log(f"  Found {len(documents)} pages", 0.20)
        # Step 3: AI metadata extraction
        log("Analyzing document with AI...", 0.25)
        metadata = extract_document_metadata(documents, uploaded_file.name)
        log(f'  Title: "{metadata.get("title", "?")}"', 0.35)
        # Step 4: Token-based chunking
        # WHY tokens, not characters? LLMs and embedding models have TOKEN limits.
        # 500 tokens ≈ 375 words ≈ 1/3 of a page. Overlap of 50 tokens ensures
        # information at chunk boundaries isn't lost.
        log("Token-based chunking...", 0.40)
        splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
            model_name=EMBEDDING_MODEL,
            chunk_size=CHUNK_SIZE,
            chunk_overlap=CHUNK_OVERLAP,
        )
        chunks = splitter.split_documents(documents)
        for i, chunk in enumerate(chunks):
            chunk.metadata["source_file"] = uploaded_file.name
            chunk.metadata["chunk_index"] = i
        log(f"  {len(chunks)} chunks created ({CHUNK_SIZE} tokens each)", 0.50)
        # Step 5: Create a synthetic TOC document
        # WHY? When users ask broad questions like "What is this document about?",
        # the individual chunks may not contain a good overview. This TOC document
        # acts as a summary that the retriever can find for high-level questions.
        toc_doc = Document(
            page_content=(
                f"Document: {metadata.get('title', uploaded_file.name)}\n"
                f"Description: {metadata.get('subtitle', '')}\n"
                f"Pages: {len(documents)}, Chunks: {len(chunks)}\n"
                f"Topics: {', '.join(metadata.get('topics', []))}"
            ),
            metadata={"source": "Table of Contents", "source_file": uploaded_file.name}
        )
        chunks.insert(0, toc_doc)
        # Step 6: Generate embeddings and store in Chroma
        # Chroma.from_documents() sends all chunk texts to OpenAI's embedding API,
        # gets back vectors, and stores everything in the persist_directory folder.
        # If a previous database exists, we close its connections and remove it first.
        log("Generating embeddings & indexing...", 0.55)
        if os.path.exists(VECTOR_DATABASE):
            try:
                old_store = Chroma(
                    persist_directory=VECTOR_DATABASE,
                    embedding_function=get_embedding_function()
                )
                if hasattr(old_store, "_client"):
                    old_store._client.clear_system_cache()
                del old_store
            except Exception:
                pass
            shutil.rmtree(VECTOR_DATABASE, ignore_errors=True)
        Chroma.from_documents(
            chunks,
            embedding=get_embedding_function(),
            persist_directory=VECTOR_DATABASE
        )
        log("  Vector store created", 0.90)
        os.unlink(tmp_path)
        # Step 7: Persist to session state (for this browser session) and
        # to disk (for surviving page refreshes / new browser sessions)
        st.session_state.document_ready = True
        st.session_state.uploaded_filename = uploaded_file.name
        st.session_state.uploaded_filesize = uploaded_file.size
        st.session_state.document_metadata = metadata
        st.session_state.chunk_count = len(chunks)
        st.session_state.page_count = len(documents)
        if "langchain_messages" in st.session_state:
            st.session_state.langchain_messages = []
        with open(DOCUMENT_STATE_FILE, "w") as f:
            json.dump({
                "uploaded_filename": uploaded_file.name,
                "uploaded_filesize": uploaded_file.size,
                "document_metadata": metadata,
                "chunk_count": len(chunks),
                "page_count": len(documents),
            }, f)
        log("Done.", 1.0)
        return True, f"{len(documents)} pages | {len(chunks)} chunks indexed"
    except Exception as e:
        return False, f"Error: {e}"
# * 5.5 RAG Chain Assembly
# --------------------------
# This is the heart of the application — it assembles the full RAG pipeline:
#
# User Question
# ↓
# [History-Aware Retriever]
# Uses chat history to reformulate the question into a standalone query.
# Example: "Tell me more" → "Tell me more about vector embeddings"
# Then searches Chroma for the k most relevant chunks.
# ↓
# [Stuff Documents Chain]
# Takes the retrieved chunks and "stuffs" them into the system prompt
# as context for the LLM.
# ↓
# [LLM (ChatOpenAI)]
# Generates an answer using the context + question + chat history.
# ↓
# [RunnableWithMessageHistory]
# Automatically loads/saves chat history to StreamlitChatMessageHistory.
# This wraps everything so history management is transparent.
#
# Two modes:
# Standard: Fast, concise answers with 4 retrieved chunks
# Reasoning: Deeper analysis with 6 retrieved chunks (uses reasoning models like o3)
def create_rag_chain(model_name: str, temp: float = None, is_reasoning: bool = False):
    vectorstore = get_vectorstore()
    if vectorstore is None:
        return None, None
    metadata = st.session_state.document_metadata or {}
    source_name = metadata.get("title", st.session_state.uploaded_filename or "Document")
    # Retrieve more chunks for reasoning mode (deeper analysis needs more context)
    k_docs = 6 if is_reasoning else 4
    retriever = vectorstore.as_retriever(search_kwargs={"k": k_docs})
    # Reasoning models (o3, o4-mini) don't accept a temperature parameter
    llm = ChatOpenAI(model=model_name) if is_reasoning else ChatOpenAI(model=model_name, temperature=temp)
    # -- Stage 1: Question Reformulation --------------------------------------
    # WHY? In a multi-turn conversation, the user might say "Tell me more about that".
    # The retriever needs a standalone question to search effectively. This prompt
    # asks the LLM to reformulate the question using chat history context.
    contextualize_q_prompt = ChatPromptTemplate.from_messages([
        ("system", "Given a chat history and the latest user question, "
                   "formulate a standalone question. Do NOT answer it, just reformulate if needed."),
        MessagesPlaceholder("chat_history"),
        ("human", "{input}"),
    ])
    msgs = StreamlitChatMessageHistory(key="langchain_messages")
    history_aware_retriever = create_history_aware_retriever(llm, retriever, contextualize_q_prompt)
    # -- Stage 2: Answer Generation -------------------------------------------
    # The system prompt tells the LLM its role, injects retrieved context via {context},
    # and instructs it to cite page numbers using [p.X] format.
    mode_text = (
        "Provide clear, concise answers. "
        "If the user asks you to summarize, find, or explain, follow their instruction naturally."
    )
    if is_reasoning:
        mode_text = (
            "ANALYSIS MODE: Provide deep, thorough explanations. "
            "Connect concepts across sections, provide examples, explain underlying principles. "
            "If the user asks you to summarize, find, or explain, follow their instruction naturally."
        )
    citation_instruction = (
        "IMPORTANT: Include page references in your answer using [p.X] format "
        "(where X is the page number from the source metadata). "
        "Place citations at the end of sentences that reference specific information. "
        "If the retrieved context does not contain enough information to answer confidently, "
        "clearly state that the answer may not be fully covered in the document."
    )
    qa_prompt = ChatPromptTemplate.from_messages([
        ("system",
         f"You are an expert assistant for: {source_name}\n\n"
         "{context}\n\n"
         f"{mode_text}\n\n"
         f"{citation_instruction}\n"
         "Use markdown formatting for readability."),
        MessagesPlaceholder("chat_history"),
        ("human", "{input}")
    ])
    # -- Stage 3: Chain Assembly ----------------------------------------------
    # create_stuff_documents_chain: LLM + prompt that expects {context} filled with docs
    # create_retrieval_chain: retriever → stuff chain (auto-fills {context} with results)
    # RunnableWithMessageHistory: wraps everything with automatic chat history
    qa_chain = create_stuff_documents_chain(llm, qa_prompt)
    rag_chain = create_retrieval_chain(history_aware_retriever, qa_chain)
    chain_with_history = RunnableWithMessageHistory(
        rag_chain,
        lambda session_id: msgs,
        input_messages_key="input",
        history_messages_key="chat_history",
        output_messages_key="answer",
    )
    return chain_with_history, retriever
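# Illustration only — the sketch below is NOT called anywhere in this app; the real
# invocation happens in the chat loop further down. It shows the expected calling
# convention: RunnableWithMessageHistory requires a session_id in the config so it
# knows which history object to load (this app always uses the single "doc_chat" session).
def _rag_chain_usage_sketch(question: str = "What is this document about?"):
    chain, _retriever = create_rag_chain("gpt-4.1-mini", temp=0.3, is_reasoning=False)
    if chain is None:
        return None
    result = chain.invoke(
        {"input": question},
        config={"configurable": {"session_id": "doc_chat"}}
    )
    return result["answer"], result.get("context", [])  # answer text + retrieved Documents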
# * 5.6 Source Text Highlighting
# --------------------------------
# When displaying retrieved source chunks, we highlight words from the user's
# query so they can quickly see why each chunk was relevant.
# Common stop words are excluded to avoid highlighting "what", "the", etc.
def highlight_terms(text: str, query: str | None) -> str:
    text = html_module.escape(text)
    text = text.replace("\n", "<br>")
    if not query:
        return text
    stop_words = {
        "what", "where", "when", "which", "that", "this", "with", "from",
        "have", "does", "about", "how", "the", "and", "for", "are", "but",
        "not", "you", "all", "can", "was", "one", "our", "out", "into",
    }
    words = [w for w in query.split() if len(w) > 3 and w.lower() not in stop_words]
    for word in words:
        pattern = re.compile(f"({re.escape(word)})", re.IGNORECASE)
        text = pattern.sub(r"<mark>\1</mark>", text)
    return text
# * 5.7 Follow-Up Question Generation
# --------------------------------------
# WHY? After each answer, we generate contextual follow-up questions so the user
# doesn't have to think of what to ask next. This creates a guided exploration
# experience — the AI suggests logical next questions based on what was just discussed.
#
# HOW: Send the Q&A pair to a small LLM and ask for 4 follow-up questions as JSON.
def generate_followup_questions(question: str, answer: str) -> list[str]:
    llm = ChatOpenAI(model=METADATA_MODEL, temperature=0.5)
    prompt = ChatPromptTemplate.from_template(
        "Based on this Q&A exchange, suggest 4 brief follow-up questions the user might ask next.\n\n"
        "Question: {question}\n\n"
        "Answer: {answer}\n\n"
        "Return ONLY a JSON array of 4 short question strings, no other text."
    )
    try:
        response = llm.invoke(prompt.format(question=question, answer=answer[:2000]))
        text = response.content.strip()
        if text.startswith("```"):
            text = text.split("```")[1]
            if text.startswith("json"):
                text = text[4:]
        return json.loads(text)[:4]
    except Exception:
        return []
# * 5.8 Document Reset
# ----------------------
# Clears all session state, chat history, the vector database, and the persisted
# state file. Used when the user clicks "New Document" to start fresh.
def reset_document():
    for key in DEFAULTS:
        st.session_state[key] = DEFAULTS[key]
    if "langchain_messages" in st.session_state:
        st.session_state.langchain_messages = []
    # Close Chroma's SQLite connection before deleting the database directory,
    # otherwise the OS may refuse to remove locked files.
    if os.path.exists(VECTOR_DATABASE):
        try:
            old_store = Chroma(
                persist_directory=VECTOR_DATABASE,
                embedding_function=get_embedding_function()
            )
            if hasattr(old_store, "_client"):
                old_store._client.clear_system_cache()
            del old_store
        except Exception:
            pass
        shutil.rmtree(VECTOR_DATABASE, ignore_errors=True)
    if os.path.exists(DOCUMENT_STATE_FILE):
        os.remove(DOCUMENT_STATE_FILE)
# =============================================================================
# 6.0 MAIN APPLICATION — Two-Stage UI
# =============================================================================
# The app has two stages:
# Stage 1 (Upload): Shown when no document is loaded. User uploads a PDF,
# which is automatically processed and indexed.
# Stage 2 (Chat): Shown after a document is indexed. Two-column layout with
# chat on the left and retrieved sources on the right.
#
# Streamlit re-runs this entire script on every interaction. The if/else below
# controls which stage is displayed based on session state.
# ─────────────────────────────────────────────────────────────────────────────
# STAGE 1: UPLOAD & INDEX
# ─────────────────────────────────────────────────────────────────────────────
if not st.session_state.document_ready:
st.markdown("""
<div class="app-header" style="text-align:center;">
<h1>Document Reference Assistant</h1>
</div>
""", unsafe_allow_html=True)
# Center the upload widget using a 3-column layout (empty | content | empty)
col_l, col_c, col_r = st.columns([1, 2, 1])
with col_c:
st.markdown("<h4 style='text-align:center;'>Add a Document</h4>", unsafe_allow_html=True)
uploaded_file = st.file_uploader(
"Choose a PDF file", type=["pdf"],
help="PDF documents up to 200 MB",
label_visibility="collapsed"
)
# Auto-process: as soon as a file is selected, start the ingestion pipeline.
# No "Process" button needed — reduces friction.
if uploaded_file is not None:
progress_bar = st.progress(0)
log_area = st.empty()
success, message = process_uploaded_pdf(uploaded_file, log_area, progress_bar)
if success:
st.success(message)
time.sleep(0.6)
st.rerun() # Trigger a re-run to switch to Stage 2
else:
st.error(message)
else:
st.caption("Drag-and-drop or browse to select a PDF file.")
# ─────────────────────────────────────────────────────────────────────────────
# STAGE 2: CHAT WITH SOURCES
# ─────────────────────────────────────────────────────────────────────────────
else:
    # Extract metadata for display throughout the chat UI
    meta = st.session_state.document_metadata or {}
    title = meta.get("title", "Document Reference")
    subtitle = meta.get("subtitle", "")
    topics = meta.get("topics", [])
    chapters = meta.get("chapters", [])
    example_questions = meta.get("example_questions", [])
    welcome = meta.get("welcome_message", "How can I help you?")
    # -- Sidebar: Document Info + Settings ------------------------------------
    # The sidebar shows the current document, model settings, and topic pills.
    with st.sidebar:
        st.markdown(f"**{st.session_state.uploaded_filename}**")
        size_mb = (st.session_state.uploaded_filesize or 0) / 1024 / 1024
        st.caption(
            f"{size_mb:.1f} MB | "
            f"{st.session_state.page_count or '?'} pages | "
            f"{st.session_state.chunk_count or '?'} chunks"
        )
        if st.button("New Document", use_container_width=True):
            reset_document()
            st.rerun()
        st.divider()
        # Model selection — Standard mode allows temperature control,
        # Reasoning mode uses models like o3/o4-mini that don't accept temperature.
        with st.expander("Settings", expanded=False):
            model_type = st.radio("Mode", ["Standard", "Reasoning"], index=0,
                                  help="Reasoning = deeper analysis, slower")
            if model_type == "Standard":
                selected_model = st.selectbox("Model", STANDARD_MODELS, index=1)
                temperature = st.slider("Temperature", 0.0, 1.0, 0.3, 0.1)
            else:
                selected_model = st.selectbox("Model", REASONING_MODELS, index=1)
                temperature = None
        # Fallback defaults if the Settings expander was never opened
        if "model_type" not in dir():
            model_type = "Standard"
            selected_model = STANDARD_MODELS[0]
            temperature = 0.3
        st.divider()
        if st.button("Clear Chat", use_container_width=True):
            st.session_state.langchain_messages = []
            st.session_state.last_sources = []
            st.session_state.last_query = None
            st.rerun()
        st.divider()
        # Topic pills — visual summary of document topics from AI metadata extraction
        if topics:
            st.markdown("**Topics**")
            pills_html = " ".join(
                f'<span style="display:inline-block;background:#e9ecef;color:#495057;'
                f'padding:3px 10px;border-radius:14px;font-size:0.75rem;margin:2px;">'
                f'{t}</span>'
                for t in topics
            )
            st.markdown(pills_html, unsafe_allow_html=True)
    # -- Header — Document title and subtitle ---------------------------------
    st.markdown(
        f'<div class="app-header"><h1>{title}</h1><p>{subtitle}</p></div>',
        unsafe_allow_html=True
    )
    # -- Status Row — Compact bar showing document, chunk count, and model ----
    is_reasoning = (model_type == "Reasoning")
    status_items = [
        f'<span class="status-item"><span class="status-label">Document:</span> {st.session_state.uploaded_filename}</span>',
        f'<span class="status-item"><span class="status-label">Indexed:</span> {st.session_state.chunk_count or "?"} chunks</span>',
        f'<span class="status-item"><span class="status-label">Model:</span> {selected_model}</span>',
    ]
    st.markdown(
        '<div class="doc-status-row">' + " ".join(status_items) + '</div>',
        unsafe_allow_html=True
    )
    # -- Two-Column Layout: Chat (70%) | Sources (30%) ------------------------
    chat_col, sources_col = st.columns([7, 3], gap="medium")
    # ---- CHAT COLUMN --------------------------------------------------------
    with chat_col:
        # StreamlitChatMessageHistory stores messages in st.session_state["langchain_messages"].
        # On first load, we seed it with the AI-generated welcome message.
        msgs = StreamlitChatMessageHistory(key="langchain_messages")
        if len(msgs.messages) == 0:
            msgs.add_ai_message(welcome)
        # Build the RAG chain for the currently selected model
        rag_chain, retriever = create_rag_chain(
            selected_model, temperature, is_reasoning
        )
        # Render the full chat history
        for msg in msgs.messages:
            role = "assistant" if msg.type == "ai" else "user"
            st.chat_message(role).write(msg.content)
        # -- Suggested / Follow-Up Question Chips -----------------------------
        # Before any exchange: show static example questions from metadata.
        # After an exchange: show AI-generated follow-up questions that are
        # contextual to the last answer. Falls back to static examples if
        # follow-up generation failed.
        followups = st.session_state.followup_questions
        if len(msgs.messages) <= 1:
            if example_questions:
                st.markdown("**Suggested questions**")
                chip_cols = st.columns(min(len(example_questions), 4))
                for idx, q in enumerate(example_questions):
                    col = chip_cols[idx % len(chip_cols)]
                    if col.button(q, key=f"chip_{idx}", use_container_width=True):
                        st.session_state.chip_question = q
                        st.rerun()
        else:
            chips = followups if followups else example_questions
            if chips:
                st.markdown("**Suggested questions**")
                chip_cols = st.columns(min(len(chips), 4))
                for idx, q in enumerate(chips):
                    col = chip_cols[idx % len(chip_cols)]
                    if col.button(q, key=f"followup_{idx}", use_container_width=True):
                        st.session_state.chip_question = q
                        st.rerun()
        # -- Conversation Utility Buttons -------------------------------------
        # Visible after at least one full Q&A exchange. These inject pre-written
        # prompts as if the user typed them:
        #   Regenerate: re-asks the last question (removes the last Q&A pair first)
        #   More detail: asks for an expanded version of the last answer
        #   Shorter: asks for a condensed version
        if len(msgs.messages) > 2:
            u1, u2, u3, _ = st.columns([2, 2, 2, 6])
            if u1.button("Regenerate", key="util_regen", use_container_width=True):
                last_q = st.session_state.last_query
                if last_q and len(st.session_state.get("langchain_messages", [])) >= 2:
                    st.session_state.langchain_messages.pop()  # remove AI answer
                    st.session_state.langchain_messages.pop()  # remove user question
                    st.session_state.chip_question = last_q
                    st.session_state.last_sources = []
                    st.rerun()
            if u2.button("More detail", key="util_more", use_container_width=True):
                st.session_state.chip_question = (
                    "Please expand on your previous answer with more detail and page citations."
                )
                st.rerun()
            if u3.button("Shorter", key="util_shorter", use_container_width=True):
                st.session_state.chip_question = (
                    "Please give a shorter, more concise version of your previous answer."
                )
                st.rerun()
        # -- Question Input & RAG Invocation ----------------------------------
        # st.chat_input provides the text box at the bottom of the chat.
        # chip_question is set when the user clicks a suggested question button.
        # Either source triggers the RAG chain.
        typed_question = st.chat_input("Ask a question about the document...")
        question = st.session_state.chip_question or typed_question
        if st.session_state.chip_question:
            st.session_state.chip_question = None  # consume the chip question
        if question:
            st.chat_message("user").write(question)
            st.session_state.last_query = question
            spinner_text = "Analyzing..." if is_reasoning else "Searching..."
            with st.spinner(spinner_text):
                try:
                    if rag_chain:
                        # Invoke the full RAG pipeline:
                        #   1. Reformulate question using chat history
                        #   2. Retrieve relevant chunks from Chroma
                        #   3. Generate answer with retrieved context
                        #   4. Save Q&A to chat history automatically
                        response = rag_chain.invoke(
                            {"input": question},
                            config={"configurable": {"session_id": "doc_chat"}}
                        )
                        st.chat_message("assistant").write(response["answer"])
                        st.session_state.last_sources = response.get("context", [])
                        # Generate contextual follow-up questions based on this exchange
                        st.session_state.followup_questions = generate_followup_questions(
                            question, response["answer"]
                        )
                        # Rerun to display the new follow-up chips (they render above
                        # the input area, which has already been drawn by this point)
                        st.rerun()
                    else:
                        st.error("RAG chain not available. Try re-uploading the document.")
                except Exception as e:
                    st.error(f"Error: {e}")
    # ---- SOURCES COLUMN -----------------------------------------------------
    # Shows the retrieved chunks that the LLM used to generate its answer.
    # Each chunk is in a collapsible expander labeled with its page and chunk number.
    # Search terms from the user's query are highlighted with <mark> tags.
    with sources_col:
        st.markdown("**Sources**")
        sources = st.session_state.last_sources
        if sources:
            # Summary bar — shows how many chunks were retrieved and which pages
            pages_cited = sorted(set(
                str(d.metadata.get("page", "?"))
                for d in sources
                if d.metadata.get("page") is not None
            ))
            pages_str = ", ".join(pages_cited) if pages_cited else "N/A"
            st.markdown(
                f'<div class="source-summary">'
                f'Retrieved <strong>{len(sources)}</strong> chunks '
                f'covering pages <strong>{pages_str}</strong></div>',
                unsafe_allow_html=True,
            )
            for i, doc in enumerate(sources):
                page = doc.metadata.get("page", "?")
                chunk_idx = doc.metadata.get("chunk_index", "?")
                with st.expander(f"Page {page} · Chunk {chunk_idx}"):
                    full_hl = highlight_terms(
                        doc.page_content, st.session_state.last_query
                    )
                    st.markdown(full_hl, unsafe_allow_html=True)
        else:
            st.caption("Sources for the latest answer will appear here.")
# =============================================================================
# 7.0 FOOTER
# =============================================================================
st.markdown(
    '<div class="app-footer">Document Reference Assistant &mdash; '
    'Built with LangChain + Streamlit &mdash; Business Science University</div>',
    unsafe_allow_html=True
)