@danielscholl
Created February 9, 2026 21:40
AI Document Assistant
# /// script
# requires-python = ">=3.10"
# dependencies = [
#     "streamlit>=1.54",
#     "langchain-core>=1.2",
#     "langchain-classic>=1.0",
#     "langchain-openai>=1.1",
#     "langchain-chroma>=1.1",
#     "langchain-community>=0.4",
#     "langchain-text-splitters>=1.1",
#     "tiktoken>=0.12",
#     "pypdf>=6.0",
#     "watchdog>=6.0",
# ]
# ///
# BUSINESS SCIENCE UNIVERSITY
# PYTHON FOR GENERATIVE AI COURSE
# DYNAMIC RAG APPLICATION CHALLENGE
# ***
#
# Document Reference Assistant — A Dynamic RAG Agent
#
# WHY THIS MATTERS (Big Picture):
# This is a complete, single-file RAG application that lets you upload any PDF
# and immediately chat with it. It demonstrates how to build an AI agent that:
# 1. INGESTS a PDF — extracts text, splits into chunks, generates embeddings
# 2. INDEXES chunks — stores vectors in a Chroma database for similarity search
# 3. RETRIEVES context — finds the most relevant chunks for each user question
# 4. GENERATES answers — uses an LLM with retrieved context to produce cited responses
# 5. MANAGES conversation — maintains chat history for multi-turn dialogue
#
# Architecture:
#   Upload PDF → Extract Pages → Chunk Text → Embed & Store → Chat Loop
#                                                                 ↓
#                          User Question → Retrieve Chunks → LLM → Answer
#
# Key concepts covered:
# - Self-launching Streamlit app (no separate "streamlit run" command needed)
# - Preflight validation (API key + model availability checks)
# - PDF loading and token-aware text chunking
# - AI-powered metadata extraction (title, topics, suggested questions)
# - Vector embeddings and Chroma vector store
# - History-aware retrieval (reformulates questions using chat context)
# - RAG chain assembly with LangChain
# - Session persistence (resume previous document on reload)
# - Dynamic follow-up question generation
#
# Run with uv (no conda needed):
# uv run --script document-agent.py
# =============================================================================
# 0.0 SELF-LAUNCH BOOTSTRAP
# =============================================================================
# WHY? Normally you'd run "streamlit run app.py" in the terminal. This bootstrap
# lets you just run "uv run --script document-agent.py" — the script detects it's
# being run directly (not by Streamlit) and re-launches itself via Streamlit.
#
# HOW IT WORKS:
# 1. First run: __name__ == "__main__" and IN_STREAMLIT is not set
# 2. We do preflight checks (API key, model access)
# 3. We set IN_STREAMLIT=1 in the environment to prevent infinite recursion
# 4. We call "streamlit run <this_file>" as a subprocess
# 5. Streamlit runs this file again, but now IN_STREAMLIT=1, so we skip this block
# and fall through to the actual app code below
import os
import sys
import subprocess
if __name__ == "__main__" and os.environ.get("IN_STREAMLIT") != "1":
    # -- Preflight Checks -----------------------------------------------------
    # Before starting the app, verify we have a valid OpenAI API key and that
    # the required models are accessible. This catches configuration errors
    # early with clear messages instead of cryptic runtime failures.
    #
    # Required models:
    #   - text-embedding-ada-002 (embedding model for vectorizing document chunks and queries)
    #   - gpt-4.1-mini (metadata extraction, follow-up question generation)
    api_key = os.environ.get("OPENAI_API_KEY")
    if not api_key:
        print("ERROR: OPENAI_API_KEY environment variable is not set.")
        print("Export it before running: export OPENAI_API_KEY='sk-...'")
        raise SystemExit(1)
    try:
        from openai import OpenAI
        client = OpenAI(api_key=api_key)
        # client.models.list() calls the OpenAI API to get all models your key can access
        available = {m.id for m in client.models.list()}
        required = {"text-embedding-ada-002", "gpt-4.1-mini"}
        missing = required - available
        if missing:
            print(f"ERROR: Required model(s) not available: {', '.join(sorted(missing))}")
            print("Check your OpenAI plan/access and try again.")
            raise SystemExit(1)
        print("Preflight OK — API key valid, required models available.")
    except SystemExit:
        raise
    except Exception as e:
        print(f"ERROR: Failed to verify OpenAI API access — {e}")
        raise SystemExit(1)
    # -- Streamlit Theme Configuration ----------------------------------------
    # Create a .streamlit/config.toml file for the dark theme if it doesn't exist.
    # Streamlit reads this file automatically on startup.
    script_dir = os.path.dirname(os.path.abspath(__file__))
    st_config_dir = os.path.join(script_dir, ".streamlit")
    st_config_file = os.path.join(st_config_dir, "config.toml")
    if not os.path.exists(st_config_file):
        os.makedirs(st_config_dir, exist_ok=True)
        with open(st_config_file, "w") as f:
            f.write('[theme]\nbase="dark"\nprimaryColor="#d33682"\n'
                    'backgroundColor="#000000"\nsecondaryBackgroundColor="#586e75"\n'
                    'textColor="#fafafa"\nfont="sans serif"\n')
    # -- Launch Streamlit -----------------------------------------------------
    # subprocess.call() runs Streamlit as a child process and waits for it to exit.
    # We pass --server.headless=true so it doesn't try to open a browser automatically.
    # SystemExit propagates the child's exit code to the parent process.
    env = os.environ.copy()
    env["IN_STREAMLIT"] = "1"
    raise SystemExit(
        subprocess.call(
            [sys.executable, "-m", "streamlit", "run", __file__,
             "--server.port", "8501", "--server.headless", "true"],
            env=env,
            cwd=script_dir,
        )
    )
# =============================================================================
# 1.0 IMPORTS
# =============================================================================
# Everything below this point runs INSIDE Streamlit (the bootstrap above is skipped).
# Chroma: Open-source vector database. Stores document embeddings on disk and
# lets you do similarity search (find the most relevant documents for a query).
from langchain_chroma import Chroma
# ChatPromptTemplate: Defines structured prompts with system/human/AI message roles.
# MessagesPlaceholder: A slot in a prompt template where chat history gets injected.
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
# Document: LangChain's standard container — holds page_content (text) + metadata (dict).
from langchain_core.documents import Document
# ChatOpenAI: Wrapper around OpenAI's chat completion API (gpt-4, gpt-4o, etc.).
# OpenAIEmbeddings: Wrapper around OpenAI's embedding API — converts text into vectors
# (lists of ~1536 numbers) that capture semantic meaning. Similar text → similar vectors.
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
# create_history_aware_retriever: Wraps a retriever so it reformulates the user's
# question using chat history BEFORE searching. This way "Tell me more about that"
# becomes "Tell me more about [the topic from the previous answer]".
# create_retrieval_chain: Combines a retriever + an LLM chain into a single pipeline:
# question → retrieve docs → pass docs + question to LLM → answer.
from langchain_classic.chains import create_history_aware_retriever, create_retrieval_chain
# create_stuff_documents_chain: The simplest document combination strategy — "stuff"
# all retrieved documents into the prompt context at once (vs. map-reduce or refine).
# Works well when chunks are small enough to fit in the context window.
from langchain_classic.chains.combine_documents import create_stuff_documents_chain
# RunnableWithMessageHistory: Wraps any chain to automatically load/save chat history.
# Each invocation reads prior messages, appends the new exchange, and persists them.
from langchain_core.runnables.history import RunnableWithMessageHistory
# StreamlitChatMessageHistory: Stores chat messages in Streamlit's session_state.
# This means history survives re-runs (Streamlit re-executes the script on every
# interaction) but is lost when the browser tab closes.
from langchain_community.chat_message_histories import StreamlitChatMessageHistory
# PyPDFLoader: Reads a PDF file and returns a list of Document objects (one per page).
from langchain_community.document_loaders import PyPDFLoader
# RecursiveCharacterTextSplitter: Splits long documents into smaller chunks for embedding.
# "Recursive" means it tries splitting on paragraphs first, then sentences, then words —
# preserving natural boundaries. from_tiktoken_encoder() counts tokens (not characters)
# so chunks respect the embedding model's token limits.
from langchain_text_splitters import RecursiveCharacterTextSplitter
# Streamlit: The web framework that powers the UI. It re-runs this entire script from
# top to bottom on every user interaction (button click, text input, etc.).
# st.session_state persists data across these re-runs.
import streamlit as st
import os
import json
import re
import html as html_module # renamed to avoid collision with streamlit's html support
import tempfile
import shutil
import time
from pathlib import Path
# =============================================================================
# 2.0 CONFIGURATION
# =============================================================================
# Central place for all tunable parameters. Change these to experiment with
# different models, chunk sizes, or storage locations.
BASE_DIR = Path(__file__).parent
# Where the Chroma vector database is stored on disk.
# This directory contains the embedded chunks from the uploaded PDF.
VECTOR_DATABASE = str(BASE_DIR / "data" / "chroma_document.db")
# JSON file that persists document metadata between browser sessions.
# When you reload the page, the app reads this to skip re-uploading.
DOCUMENT_STATE_FILE = str(BASE_DIR / "data" / "document_state.json")
# EMBEDDING MODEL: text-embedding-ada-002
# - Outputs a 1536-dimensional vector for each text input
# - Used for BOTH indexing documents AND embedding user queries
# - CRITICAL: The same model must be used for indexing and querying,
# otherwise the vectors won't be in the same "space" and search fails
# - Resource: https://platform.openai.com/docs/models
EMBEDDING_MODEL = "text-embedding-ada-002"
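# Illustration only — the sketch below is NOT called anywhere in this app. It is a
# minimal example (assuming OPENAI_API_KEY is set) you can run from a REPL to see
# what an embedding actually is: ada-002 returns a 1536-dimensional unit-length
# vector, so the dot product of two vectors is their cosine similarity.
def _embedding_sketch():
    from openai import OpenAI  # available via the langchain-openai dependency
    client = OpenAI()
    texts = ["vector databases store embeddings", "Chroma persists document vectors"]
    vectors = [d.embedding for d in client.embeddings.create(model=EMBEDDING_MODEL, input=texts).data]
    # Dot product of unit-length vectors = cosine similarity (higher = more related)
    similarity = sum(a * b for a, b in zip(vectors[0], vectors[1]))
    return len(vectors[0]), similarity  # (1536, similarity score)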
# METADATA MODEL: Used for lightweight AI tasks (extracting document metadata,
# generating follow-up questions). A small, fast, cheap model works best here
# since these tasks don't need deep reasoning.
METADATA_MODEL = "gpt-4.1-mini"
# CHAT MODELS — Three tiers the user can select in the sidebar.
# Each tier offers a meaningful difference in quality/cost/speed:
#
# Standard models (accept temperature parameter):
# gpt-4o-mini — Cheapest & fastest. Good for simple Q&A. ~$0.15/$0.60 per 1M tokens.
# gpt-4.1-mini — Best balance of quality and cost. ~$0.40/$1.60 per 1M tokens. (DEFAULT)
# gpt-4.1 — Most capable standard model. ~$2.00/$8.00 per 1M tokens.
#
# Reasoning models (do NOT accept temperature — they self-regulate):
# o3-mini — Cheapest reasoning model. Good for basic analysis. ~$1.10/$4.40 per 1M tokens.
# o4-mini — Best balance for reasoning tasks. ~$1.10/$4.40 per 1M tokens. (DEFAULT)
# o3 — Deepest reasoning. Slow but thorough. ~$2.00/$8.00 per 1M tokens.
STANDARD_MODELS = ["gpt-4o-mini", "gpt-4.1-mini", "gpt-4.1"]
REASONING_MODELS = ["o3-mini", "o4-mini", "o3"]
# CHUNKING PARAMETERS:
# CHUNK_SIZE: Target size of each text chunk in tokens (not characters).
# Smaller chunks = more precise retrieval but less context per chunk.
# Larger chunks = more context but may include irrelevant text.
# 500 tokens is a good default (~375 words, ~1/3 of a page).
# CHUNK_OVERLAP: How many tokens overlap between consecutive chunks.
# This prevents information from being split across chunk boundaries.
# 50 tokens of overlap means the end of chunk N appears at the start of chunk N+1.
CHUNK_SIZE = 500
CHUNK_OVERLAP = 50
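# Illustration only — the sketch below is NOT called anywhere in this app. It shows
# why chunks are sized in tokens rather than characters: tiktoken (already a
# dependency above) maps a model name to its tokenizer, and chunk/context limits
# apply to the token count, not the character count.
def _token_count_sketch(text: str = "Retrieval-augmented generation grounds LLM answers in your own documents."):
    import tiktoken
    enc = tiktoken.encoding_for_model(EMBEDDING_MODEL)
    # The character count is typically several times larger than the token count.
    return {"characters": len(text), "tokens": len(enc.encode(text))}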
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
# =============================================================================
# 3.0 PAGE SETUP & STYLING
# =============================================================================
# Streamlit page configuration must be the first st.* call in the script.
# layout="wide" uses the full browser width instead of a narrow centered column.
st.set_page_config(
    page_title="Document Assistant",
    page_icon="docs",
    layout="wide",
    initial_sidebar_state="collapsed"
)
# Custom CSS injected via st.markdown with unsafe_allow_html=True.
# Streamlit's default styling is functional but generic — this CSS gives the app
# a polished, professional look with a dark-themed header, styled source cards, etc.
st.markdown("""
<style>
/* Layout */
.block-container { max-width: 1400px; padding-top: 1rem; }
/* Header bar — gradient background with accent border */
.app-header {
background: linear-gradient(135deg, #1a1a2e 0%, #16213e 50%, #0f3460 100%);
color: #e8e8e8;
padding: 20px 28px;
border-radius: 8px;
margin-bottom: 4px;
border-left: 4px solid #4a90d9;
}
.app-header h1 { margin: 0; font-size: 1.6rem; font-weight: 600; }
.app-header p { margin: 4px 0 0 0; opacity: 0.85; font-size: 0.9rem; }
/* Document status row — shows filename, chunk count, model */
.doc-status-row {
display: flex;
flex-wrap: wrap;
gap: 20px;
padding: 10px 16px;
background: #f0f4f8;
border-radius: 6px;
margin-bottom: 16px;
font-size: 0.82rem;
color: #495057;
border-left: 3px solid #4a90d9;
}
.doc-status-row .status-item { white-space: nowrap; }
.doc-status-row .status-label { color: #6c757d; margin-right: 4px; }
/* Question chips — clickable suggested/follow-up questions */
.chip-container { display: flex; flex-wrap: wrap; gap: 8px; margin: 8px 0 16px 0; }
.q-chip {
display: inline-block;
background: #e9ecef;
color: #343a40;
padding: 6px 14px;
border-radius: 20px;
font-size: 0.82rem;
cursor: pointer;
border: 1px solid #dee2e6;
transition: all 0.15s ease;
}
.q-chip:hover { background: #4a90d9; color: white; border-color: #4a90d9; }
/* Source card — displays retrieved chunk content */
.source-card {
background: #f8f9fa;
border-left: 3px solid #4a90d9;
border-radius: 6px;
padding: 10px 14px;
margin-bottom: 10px;
font-size: 0.82rem;
line-height: 1.5;
}
.source-card .source-meta {
color: #6c757d;
font-size: 0.75rem;
margin-bottom: 4px;
}
/* Processing log — terminal-style output during PDF indexing */
.process-log {
background: #1a1a2e;
color: #a8d8a8;
border-radius: 6px;
padding: 12px 16px;
font-family: 'Menlo', 'Consolas', monospace;
font-size: 0.78rem;
line-height: 1.8;
max-height: 180px;
overflow-y: auto;
}
/* Info card on upload page */
.info-card {
background: #f8f9fa;
border-radius: 8px;
padding: 14px 18px;
margin: 8px 0;
border-left: 3px solid #4a90d9;
font-size: 0.88rem;
}
/* Sidebar compact */
section[data-testid="stSidebar"] { width: 280px !important; }
section[data-testid="stSidebar"] .block-container { padding-top: 1rem; }
/* Footer */
.app-footer {
text-align: center; color: #6c757d; font-size: 0.75rem;
padding: 12px 0; border-top: 1px solid #dee2e6; margin-top: 24px;
}
/* Highlighted search terms in source cards */
.source-card mark {
background: #fff3cd;
padding: 1px 3px;
border-radius: 2px;
font-weight: 500;
}
/* Source summary bar — "Retrieved N chunks covering pages X, Y, Z" */
.source-summary {
background: #e8f4f8;
border-radius: 6px;
padding: 8px 12px;
margin-bottom: 12px;
font-size: 0.78rem;
color: #495057;
}
/* Low confidence warning */
.confidence-warning {
background: #fff3cd;
border-left: 3px solid #ffc107;
border-radius: 6px;
padding: 10px 14px;
margin: 8px 0;
font-size: 0.82rem;
color: #856404;
}
/* Hide default Streamlit branding */
#MainMenu {visibility: hidden;}
footer {visibility: hidden;}
</style>
""", unsafe_allow_html=True)
# =============================================================================
# 4.0 SESSION STATE
# =============================================================================
# Streamlit re-runs the entire script on every interaction. Session state is how
# we persist data across those re-runs. Think of it as a per-user dictionary that
# survives button clicks and text input but resets when the browser tab closes.
#
# DEFAULTS defines every key we use and its initial value. On each re-run, we
# only set keys that don't exist yet (preserving values from previous interactions).
DEFAULTS = {
    "document_ready": False,       # True once a PDF has been indexed
    "document_metadata": None,     # AI-extracted: title, topics, questions, etc.
    "uploaded_filename": None,     # Original filename of the uploaded PDF
    "uploaded_filesize": None,     # File size in bytes
    "chunk_count": None,           # Number of chunks in the vector store
    "page_count": None,            # Number of pages in the PDF
    "last_sources": [],            # Retrieved documents for the latest answer
    "last_query": None,            # Last user question (for highlighting terms)
    "chip_question": None,         # Set when user clicks a suggested question chip
    "followup_questions": [],      # AI-generated follow-up questions after each answer
}
for key, val in DEFAULTS.items():
    if key not in st.session_state:
        st.session_state[key] = val
# -- Session Restore ----------------------------------------------------------
# WHY? Without this, refreshing the browser would lose the loaded document and
# force re-uploading. We persist document metadata to a JSON file on disk (alongside
# the Chroma vector DB). On startup, if both exist, we restore session state so
# the user goes straight to the chat screen.
if (
    not st.session_state.document_ready
    and os.path.exists(VECTOR_DATABASE)
    and os.path.exists(DOCUMENT_STATE_FILE)
):
    try:
        with open(DOCUMENT_STATE_FILE) as f:
            saved = json.load(f)
        st.session_state.document_ready = True
        st.session_state.document_metadata = saved.get("document_metadata")
        st.session_state.uploaded_filename = saved.get("uploaded_filename")
        st.session_state.uploaded_filesize = saved.get("uploaded_filesize")
        st.session_state.chunk_count = saved.get("chunk_count")
        st.session_state.page_count = saved.get("page_count")
    except Exception:
        pass  # corrupt file — fall through to upload screen
# =============================================================================
# 5.0 CORE FUNCTIONS — RAG Pipeline
# =============================================================================
# These functions implement the RAG pipeline:
# get_embedding_function() → creates the embedding model (cached)
# get_vectorstore() → connects to the Chroma vector database
# extract_document_metadata() → uses AI to extract title, topics, questions from PDF
# process_uploaded_pdf() → full ingestion pipeline (load → chunk → embed → store)
# create_rag_chain() → assembles the retrieval + generation chain
# generate_followup_questions() → generates contextual follow-up suggestions
# highlight_terms() → highlights search terms in source text
# reset_document() → clears everything for a fresh upload
# * 5.1 Embedding Function (Cached)
# -----------------------------------
# @st.cache_resource means this function runs ONCE and the result is reused across
# all re-runs and all users. This avoids creating a new OpenAIEmbeddings object
# (and its HTTP connection pool) on every single Streamlit re-run.
#
# CRITICAL: The same embedding model must be used for BOTH indexing and querying.
@st.cache_resource(show_spinner=False)
def get_embedding_function():
    return OpenAIEmbeddings(model=EMBEDDING_MODEL)
# * 5.2 Vector Store Connection
# ------------------------------
# Connects to an existing Chroma database on disk. Does NOT create a new one —
# that happens in process_uploaded_pdf(). Returns None if no document is loaded.
def get_vectorstore():
    if not st.session_state.document_ready:
        return None
    return Chroma(
        persist_directory=VECTOR_DATABASE,
        embedding_function=get_embedding_function()
    )
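# Illustration only — the sketch below is NOT called anywhere in this app. It is a
# minimal example of what the retriever built later in create_rag_chain() does
# under the hood: Chroma embeds the query with the same model used at indexing
# time and returns the k closest chunks.
def _similarity_search_sketch(query: str, k: int = 4):
    store = get_vectorstore()
    if store is None:
        return []
    return store.similarity_search(query, k=k)  # list[Document], most similar first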
# * 5.3 AI-Powered Metadata Extraction
# --------------------------------------
# WHY? Instead of showing the raw filename as the app title, we use a small LLM
# to analyze the first few pages and extract structured metadata: a professional
# title, topics, suggested questions, and a welcome message. This makes the chat
# experience feel tailored to each document.
#
# HOW: We take the first ~4000 chars from the first 5 pages, send them to
# gpt-4.1-mini with a structured prompt, and parse the JSON response.
# If anything fails, we fall back to sensible defaults derived from the filename.
def extract_document_metadata(documents: list, filename: str) -> dict:
    sample_content = ""
    for doc in documents[:5]:
        sample_content += doc.page_content[:1000] + "\n\n"
    sample_content = sample_content[:4000]
    llm = ChatOpenAI(model=METADATA_MODEL, temperature=0.3)
    prompt = ChatPromptTemplate.from_template("""Analyze this document excerpt and extract metadata.
Document filename: {filename}
Content sample:
{content}
Return a JSON object with these exact keys:
{{
"title": "A concise, professional title for this document (max 50 chars)",
"subtitle": "A brief description of what this document covers (max 100 chars)",
"topics": ["topic1", "topic2", ...], // 8-15 key topics/concepts covered
"chapters": ["Chapter 1: Title", "Chapter 2: Title", ...], // Main sections/chapters if identifiable from TOC or headings. Empty list if none.
"example_questions": [
"Question 1 that a reader might ask about this content",
"Question 2...",
...
], // 6-8 diverse example questions
"welcome_message": "A brief welcome message for a chat assistant (max 150 chars)"
}}
Return ONLY valid JSON, no other text.""")
    try:
        response = llm.invoke(prompt.format(filename=filename, content=sample_content))
        # Strip markdown code fences if the LLM wraps the JSON in ```json ... ```
        json_str = response.content.strip()
        if json_str.startswith("```"):
            json_str = json_str.split("```")[1]
            if json_str.startswith("json"):
                json_str = json_str[4:]
        return json.loads(json_str)
    except Exception:
        # Fallback: derive basic metadata from the filename
        return {
            "title": filename.replace(".pdf", "").replace("_", " ").title(),
            "subtitle": "Document Reference Assistant",
            "topics": ["Overview", "Key Concepts", "Details", "Summary"],
            "example_questions": [
                "What is the main topic of this document?",
                "Can you summarize the key points?",
                "What are the important concepts covered?",
                "How does this relate to practical applications?"
            ],
            "welcome_message": f"I can answer questions about {filename}. What would you like to know?"
        }
# * 5.4 PDF Ingestion Pipeline
# ------------------------------
# This is the main ingestion function. It takes a raw uploaded PDF and produces
# a fully searchable vector store. The pipeline:
#
# 1. Save uploaded bytes to a temp file (PyPDFLoader needs a file path)
# 2. Extract text from every page → list of Document objects
# 3. AI metadata extraction (title, topics, questions)
# 4. Token-based chunking (split pages into ~500 token chunks)
# 5. Create a TOC document (synthetic document with document overview)
# 6. Generate embeddings via OpenAI and store in Chroma
# 7. Persist document state to disk for session restore
#
# The log_container and progress_bar provide real-time feedback in the UI.
def process_uploaded_pdf(uploaded_file, log_container, progress_bar) -> tuple[bool, str]:
    logs = []
    def log(msg, pct):
        logs.append(msg)
        log_container.markdown(
            '<div class="process-log">' + "<br>".join(f"&gt; {l}" for l in logs) + "</div>",
            unsafe_allow_html=True
        )
        progress_bar.progress(pct)
    try:
        # Step 1: Save to temp file
        log("Saving uploaded file...", 0.05)
        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
            tmp.write(uploaded_file.getvalue())
            tmp_path = tmp.name
        # Step 2: Extract text from PDF
        # PyPDFLoader reads each page into a Document with page_content and metadata (page number).
        log("Extracting text from PDF...", 0.15)
        loader = PyPDFLoader(tmp_path)
        documents = loader.load()
        if not documents:
            return False, "No content could be extracted from the PDF."
        log(f"  Found {len(documents)} pages", 0.20)
        # Step 3: AI metadata extraction
        log("Analyzing document with AI...", 0.25)
        metadata = extract_document_metadata(documents, uploaded_file.name)
        log(f'  Title: "{metadata.get("title", "?")}"', 0.35)
        # Step 4: Token-based chunking
        # WHY tokens, not characters? LLMs and embedding models have TOKEN limits.
        # 500 tokens ≈ 375 words ≈ 1/3 of a page. Overlap of 50 tokens ensures
        # information at chunk boundaries isn't lost.
        log("Token-based chunking...", 0.40)
        splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
            model_name=EMBEDDING_MODEL,
            chunk_size=CHUNK_SIZE,
            chunk_overlap=CHUNK_OVERLAP,
        )
        chunks = splitter.split_documents(documents)
        for i, chunk in enumerate(chunks):
            chunk.metadata["source_file"] = uploaded_file.name
            chunk.metadata["chunk_index"] = i
        log(f"  {len(chunks)} chunks created ({CHUNK_SIZE} tokens each)", 0.50)
        # Step 5: Create a synthetic TOC document
        # WHY? When users ask broad questions like "What is this document about?",
        # the individual chunks may not contain a good overview. This TOC document
        # acts as a summary that the retriever can find for high-level questions.
        toc_doc = Document(
            page_content=(
                f"Document: {metadata.get('title', uploaded_file.name)}\n"
                f"Description: {metadata.get('subtitle', '')}\n"
                f"Pages: {len(documents)}, Chunks: {len(chunks)}\n"
                f"Topics: {', '.join(metadata.get('topics', []))}"
            ),
            metadata={"source": "Table of Contents", "source_file": uploaded_file.name}
        )
        chunks.insert(0, toc_doc)
        # Step 6: Generate embeddings and store in Chroma
        # Chroma.from_documents() sends all chunk texts to OpenAI's embedding API,
        # gets back vectors, and stores everything in the persist_directory folder.
        # If a previous database exists, we close its connections and remove it first.
        log("Generating embeddings & indexing...", 0.55)
        if os.path.exists(VECTOR_DATABASE):
            try:
                old_store = Chroma(
                    persist_directory=VECTOR_DATABASE,
                    embedding_function=get_embedding_function()
                )
                if hasattr(old_store, "_client"):
                    old_store._client.clear_system_cache()
                del old_store
            except Exception:
                pass
            shutil.rmtree(VECTOR_DATABASE, ignore_errors=True)
        Chroma.from_documents(
            chunks,
            embedding=get_embedding_function(),
            persist_directory=VECTOR_DATABASE
        )
        log("  Vector store created", 0.90)
        os.unlink(tmp_path)
        # Step 7: Persist to session state (for this browser session) and
        # to disk (for surviving page refreshes / new browser sessions)
        st.session_state.document_ready = True
        st.session_state.uploaded_filename = uploaded_file.name
        st.session_state.uploaded_filesize = uploaded_file.size
        st.session_state.document_metadata = metadata
        st.session_state.chunk_count = len(chunks)
        st.session_state.page_count = len(documents)
        if "langchain_messages" in st.session_state:
            st.session_state.langchain_messages = []
        with open(DOCUMENT_STATE_FILE, "w") as f:
            json.dump({
                "uploaded_filename": uploaded_file.name,
                "uploaded_filesize": uploaded_file.size,
                "document_metadata": metadata,
                "chunk_count": len(chunks),
                "page_count": len(documents),
            }, f)
        log("Done.", 1.0)
        return True, f"{len(documents)} pages | {len(chunks)} chunks indexed"
    except Exception as e:
        return False, f"Error: {e}"
# * 5.5 RAG Chain Assembly
# --------------------------
# This is the heart of the application — it assembles the full RAG pipeline:
#
# User Question
# ↓
# [History-Aware Retriever]
# Uses chat history to reformulate the question into a standalone query.
# Example: "Tell me more" → "Tell me more about vector embeddings"
# Then searches Chroma for the k most relevant chunks.
# ↓
# [Stuff Documents Chain]
# Takes the retrieved chunks and "stuffs" them into the system prompt
# as context for the LLM.
# ↓
# [LLM (ChatOpenAI)]
# Generates an answer using the context + question + chat history.
# ↓
# [RunnableWithMessageHistory]
# Automatically loads/saves chat history to StreamlitChatMessageHistory.
# This wraps everything so history management is transparent.
#
# Two modes:
# Standard: Fast, concise answers with 4 retrieved chunks
# Reasoning: Deeper analysis with 6 retrieved chunks (uses reasoning models like o3)
def create_rag_chain(model_name: str, temp: float = None, is_reasoning: bool = False):
    vectorstore = get_vectorstore()
    if vectorstore is None:
        return None, None
    metadata = st.session_state.document_metadata or {}
    source_name = metadata.get("title", st.session_state.uploaded_filename or "Document")
    # Retrieve more chunks for reasoning mode (deeper analysis needs more context)
    k_docs = 6 if is_reasoning else 4
    retriever = vectorstore.as_retriever(search_kwargs={"k": k_docs})
    # Reasoning models (o3, o4-mini) don't accept a temperature parameter
    llm = ChatOpenAI(model=model_name) if is_reasoning else ChatOpenAI(model=model_name, temperature=temp)
    # -- Stage 1: Question Reformulation --------------------------------------
    # WHY? In a multi-turn conversation, the user might say "Tell me more about that".
    # The retriever needs a standalone question to search effectively. This prompt
    # asks the LLM to reformulate the question using chat history context.
    contextualize_q_prompt = ChatPromptTemplate.from_messages([
        ("system", "Given a chat history and the latest user question, "
                   "formulate a standalone question. Do NOT answer it, just reformulate if needed."),
        MessagesPlaceholder("chat_history"),
        ("human", "{input}"),
    ])
    msgs = StreamlitChatMessageHistory(key="langchain_messages")
    history_aware_retriever = create_history_aware_retriever(llm, retriever, contextualize_q_prompt)
    # -- Stage 2: Answer Generation -------------------------------------------
    # The system prompt tells the LLM its role, injects retrieved context via {context},
    # and instructs it to cite page numbers using [p.X] format.
    mode_text = (
        "Provide clear, concise answers. "
        "If the user asks you to summarize, find, or explain, follow their instruction naturally."
    )
    if is_reasoning:
        mode_text = (
            "ANALYSIS MODE: Provide deep, thorough explanations. "
            "Connect concepts across sections, provide examples, explain underlying principles. "
            "If the user asks you to summarize, find, or explain, follow their instruction naturally."
        )
    citation_instruction = (
        "IMPORTANT: Include page references in your answer using [p.X] format "
        "(where X is the page number from the source metadata). "
        "Place citations at the end of sentences that reference specific information. "
        "If the retrieved context does not contain enough information to answer confidently, "
        "clearly state that the answer may not be fully covered in the document."
    )
    qa_prompt = ChatPromptTemplate.from_messages([
        ("system",
         f"You are an expert assistant for: {source_name}\n\n"
         "{context}\n\n"
         f"{mode_text}\n\n"
         f"{citation_instruction}\n"
         "Use markdown formatting for readability."),
        MessagesPlaceholder("chat_history"),
        ("human", "{input}")
    ])
    # -- Stage 3: Chain Assembly ----------------------------------------------
    # create_stuff_documents_chain: LLM + prompt that expects {context} filled with docs
    # create_retrieval_chain: retriever → stuff chain (auto-fills {context} with results)
    # RunnableWithMessageHistory: wraps everything with automatic chat history
    qa_chain = create_stuff_documents_chain(llm, qa_prompt)
    rag_chain = create_retrieval_chain(history_aware_retriever, qa_chain)
    chain_with_history = RunnableWithMessageHistory(
        rag_chain,
        lambda session_id: msgs,
        input_messages_key="input",
        history_messages_key="chat_history",
        output_messages_key="answer",
    )
    return chain_with_history, retriever
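# Illustration only — the sketch below is NOT called anywhere in this app; the real
# invocation happens in the chat loop further down. It shows the expected calling
# convention: RunnableWithMessageHistory requires a session_id in the config so it
# knows which history object to load (this app always uses the single "doc_chat" session).
def _rag_chain_usage_sketch(question: str = "What is this document about?"):
    chain, _retriever = create_rag_chain("gpt-4.1-mini", temp=0.3, is_reasoning=False)
    if chain is None:
        return None
    result = chain.invoke(
        {"input": question},
        config={"configurable": {"session_id": "doc_chat"}}
    )
    return result["answer"], result.get("context", [])  # answer text + retrieved Documents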
# * 5.6 Source Text Highlighting
# --------------------------------
# When displaying retrieved source chunks, we highlight words from the user's
# query so they can quickly see why each chunk was relevant.
# Common stop words are excluded to avoid highlighting "what", "the", etc.
def highlight_terms(text: str, query: str | None) -> str:
    text = html_module.escape(text)
    text = text.replace("\n", "<br>")
    if not query:
        return text
    stop_words = {
        "what", "where", "when", "which", "that", "this", "with", "from",
        "have", "does", "about", "how", "the", "and", "for", "are", "but",
        "not", "you", "all", "can", "was", "one", "our", "out", "into",
    }
    words = [w for w in query.split() if len(w) > 3 and w.lower() not in stop_words]
    for word in words:
        pattern = re.compile(f"({re.escape(word)})", re.IGNORECASE)
        text = pattern.sub(r"<mark>\1</mark>", text)
    return text
# * 5.7 Follow-Up Question Generation
# --------------------------------------
# WHY? After each answer, we generate contextual follow-up questions so the user
# doesn't have to think of what to ask next. This creates a guided exploration
# experience — the AI suggests logical next questions based on what was just discussed.
#
# HOW: Send the Q&A pair to a small LLM and ask for 4 follow-up questions as JSON.
def generate_followup_questions(question: str, answer: str) -> list[str]:
    llm = ChatOpenAI(model=METADATA_MODEL, temperature=0.5)
    prompt = ChatPromptTemplate.from_template(
        "Based on this Q&A exchange, suggest 4 brief follow-up questions the user might ask next.\n\n"
        "Question: {question}\n\n"
        "Answer: {answer}\n\n"
        "Return ONLY a JSON array of 4 short question strings, no other text."
    )
    try:
        response = llm.invoke(prompt.format(question=question, answer=answer[:2000]))
        text = response.content.strip()
        if text.startswith("```"):
            text = text.split("```")[1]
            if text.startswith("json"):
                text = text[4:]
        return json.loads(text)[:4]
    except Exception:
        return []
# * 5.8 Document Reset
# ----------------------
# Clears all session state, chat history, the vector database, and the persisted
# state file. Used when the user clicks "New Document" to start fresh.
def reset_document():
    for key in DEFAULTS:
        st.session_state[key] = DEFAULTS[key]
    if "langchain_messages" in st.session_state:
        st.session_state.langchain_messages = []
    # Close Chroma's SQLite connection before deleting the database directory,
    # otherwise the OS may refuse to remove locked files.
    if os.path.exists(VECTOR_DATABASE):
        try:
            old_store = Chroma(
                persist_directory=VECTOR_DATABASE,
                embedding_function=get_embedding_function()
            )
            if hasattr(old_store, "_client"):
                old_store._client.clear_system_cache()
            del old_store
        except Exception:
            pass
        shutil.rmtree(VECTOR_DATABASE, ignore_errors=True)
    if os.path.exists(DOCUMENT_STATE_FILE):
        os.remove(DOCUMENT_STATE_FILE)
# =============================================================================
# 6.0 MAIN APPLICATION — Two-Stage UI
# =============================================================================
# The app has two stages:
# Stage 1 (Upload): Shown when no document is loaded. User uploads a PDF,
# which is automatically processed and indexed.
# Stage 2 (Chat): Shown after a document is indexed. Two-column layout with
# chat on the left and retrieved sources on the right.
#
# Streamlit re-runs this entire script on every interaction. The if/else below
# controls which stage is displayed based on session state.
# ─────────────────────────────────────────────────────────────────────────────
# STAGE 1: UPLOAD & INDEX
# ─────────────────────────────────────────────────────────────────────────────
if not st.session_state.document_ready:
st.markdown("""
<div class="app-header" style="text-align:center;">
<h1>Document Reference Assistant</h1>
</div>
""", unsafe_allow_html=True)
# Center the upload widget using a 3-column layout (empty | content | empty)
col_l, col_c, col_r = st.columns([1, 2, 1])
with col_c:
st.markdown("<h4 style='text-align:center;'>Add a Document</h4>", unsafe_allow_html=True)
uploaded_file = st.file_uploader(
"Choose a PDF file", type=["pdf"],
help="PDF documents up to 200 MB",
label_visibility="collapsed"
)
# Auto-process: as soon as a file is selected, start the ingestion pipeline.
# No "Process" button needed — reduces friction.
if uploaded_file is not None:
progress_bar = st.progress(0)
log_area = st.empty()
success, message = process_uploaded_pdf(uploaded_file, log_area, progress_bar)
if success:
st.success(message)
time.sleep(0.6)
st.rerun() # Trigger a re-run to switch to Stage 2
else:
st.error(message)
else:
st.caption("Drag-and-drop or browse to select a PDF file.")
# ─────────────────────────────────────────────────────────────────────────────
# STAGE 2: CHAT WITH SOURCES
# ─────────────────────────────────────────────────────────────────────────────
else:
    # Extract metadata for display throughout the chat UI
    meta = st.session_state.document_metadata or {}
    title = meta.get("title", "Document Reference")
    subtitle = meta.get("subtitle", "")
    topics = meta.get("topics", [])
    chapters = meta.get("chapters", [])
    example_questions = meta.get("example_questions", [])
    welcome = meta.get("welcome_message", "How can I help you?")
    # -- Sidebar: Document Info + Settings ------------------------------------
    # The sidebar shows the current document, model settings, and topic pills.
    with st.sidebar:
        st.markdown(f"**{st.session_state.uploaded_filename}**")
        size_mb = (st.session_state.uploaded_filesize or 0) / 1024 / 1024
        st.caption(
            f"{size_mb:.1f} MB | "
            f"{st.session_state.page_count or '?'} pages | "
            f"{st.session_state.chunk_count or '?'} chunks"
        )
        if st.button("New Document", use_container_width=True):
            reset_document()
            st.rerun()
        st.divider()
        # Model selection — Standard mode allows temperature control,
        # Reasoning mode uses models like o3/o4-mini that don't accept temperature.
        with st.expander("Settings", expanded=False):
            model_type = st.radio("Mode", ["Standard", "Reasoning"], index=0,
                                  help="Reasoning = deeper analysis, slower")
            if model_type == "Standard":
                selected_model = st.selectbox("Model", STANDARD_MODELS, index=1)
                temperature = st.slider("Temperature", 0.0, 1.0, 0.3, 0.1)
            else:
                selected_model = st.selectbox("Model", REASONING_MODELS, index=1)
                temperature = None
        # Fallback defaults if the Settings expander was never opened
        if "model_type" not in dir():
            model_type = "Standard"
            selected_model = STANDARD_MODELS[0]
            temperature = 0.3
        st.divider()
        if st.button("Clear Chat", use_container_width=True):
            st.session_state.langchain_messages = []
            st.session_state.last_sources = []
            st.session_state.last_query = None
            st.rerun()
        st.divider()
        # Topic pills — visual summary of document topics from AI metadata extraction
        if topics:
            st.markdown("**Topics**")
            pills_html = " ".join(
                f'<span style="display:inline-block;background:#e9ecef;color:#495057;'
                f'padding:3px 10px;border-radius:14px;font-size:0.75rem;margin:2px;">'
                f'{t}</span>'
                for t in topics
            )
            st.markdown(pills_html, unsafe_allow_html=True)
    # -- Header — Document title and subtitle ---------------------------------
    st.markdown(
        f'<div class="app-header"><h1>{title}</h1><p>{subtitle}</p></div>',
        unsafe_allow_html=True
    )
    # -- Status Row — Compact bar showing document, chunk count, and model ----
    is_reasoning = (model_type == "Reasoning")
    status_items = [
        f'<span class="status-item"><span class="status-label">Document:</span> {st.session_state.uploaded_filename}</span>',
        f'<span class="status-item"><span class="status-label">Indexed:</span> {st.session_state.chunk_count or "?"} chunks</span>',
        f'<span class="status-item"><span class="status-label">Model:</span> {selected_model}</span>',
    ]
    st.markdown(
        '<div class="doc-status-row">' + " ".join(status_items) + '</div>',
        unsafe_allow_html=True
    )
    # -- Two-Column Layout: Chat (70%) | Sources (30%) ------------------------
    chat_col, sources_col = st.columns([7, 3], gap="medium")
    # ---- CHAT COLUMN --------------------------------------------------------
    with chat_col:
        # StreamlitChatMessageHistory stores messages in st.session_state["langchain_messages"].
        # On first load, we seed it with the AI-generated welcome message.
        msgs = StreamlitChatMessageHistory(key="langchain_messages")
        if len(msgs.messages) == 0:
            msgs.add_ai_message(welcome)
        # Build the RAG chain for the currently selected model
        rag_chain, retriever = create_rag_chain(
            selected_model, temperature, is_reasoning
        )
        # Render the full chat history
        for msg in msgs.messages:
            role = "assistant" if msg.type == "ai" else "user"
            st.chat_message(role).write(msg.content)
        # -- Suggested / Follow-Up Question Chips -----------------------------
        # Before any exchange: show static example questions from metadata.
        # After an exchange: show AI-generated follow-up questions that are
        # contextual to the last answer. Falls back to static examples if
        # follow-up generation failed.
        followups = st.session_state.followup_questions
        if len(msgs.messages) <= 1:
            if example_questions:
                st.markdown("**Suggested questions**")
                chip_cols = st.columns(min(len(example_questions), 4))
                for idx, q in enumerate(example_questions):
                    col = chip_cols[idx % len(chip_cols)]
                    if col.button(q, key=f"chip_{idx}", use_container_width=True):
                        st.session_state.chip_question = q
                        st.rerun()
        else:
            chips = followups if followups else example_questions
            if chips:
                st.markdown("**Suggested questions**")
                chip_cols = st.columns(min(len(chips), 4))
                for idx, q in enumerate(chips):
                    col = chip_cols[idx % len(chip_cols)]
                    if col.button(q, key=f"followup_{idx}", use_container_width=True):
                        st.session_state.chip_question = q
                        st.rerun()
        # -- Conversation Utility Buttons -------------------------------------
        # Visible after at least one full Q&A exchange. These inject pre-written
        # prompts as if the user typed them:
        #   Regenerate: re-asks the last question (removes the last Q&A pair first)
        #   More detail: asks for an expanded version of the last answer
        #   Shorter: asks for a condensed version
        if len(msgs.messages) > 2:
            u1, u2, u3, _ = st.columns([2, 2, 2, 6])
            if u1.button("Regenerate", key="util_regen", use_container_width=True):
                last_q = st.session_state.last_query
                if last_q and len(st.session_state.get("langchain_messages", [])) >= 2:
                    st.session_state.langchain_messages.pop()  # remove AI answer
                    st.session_state.langchain_messages.pop()  # remove user question
                    st.session_state.chip_question = last_q
                    st.session_state.last_sources = []
                    st.rerun()
            if u2.button("More detail", key="util_more", use_container_width=True):
                st.session_state.chip_question = (
                    "Please expand on your previous answer with more detail and page citations."
                )
                st.rerun()
            if u3.button("Shorter", key="util_shorter", use_container_width=True):
                st.session_state.chip_question = (
                    "Please give a shorter, more concise version of your previous answer."
                )
                st.rerun()
        # -- Question Input & RAG Invocation ----------------------------------
        # st.chat_input provides the text box at the bottom of the chat.
        # chip_question is set when the user clicks a suggested question button.
        # Either source triggers the RAG chain.
        typed_question = st.chat_input("Ask a question about the document...")
        question = st.session_state.chip_question or typed_question
        if st.session_state.chip_question:
            st.session_state.chip_question = None  # consume the chip question
        if question:
            st.chat_message("user").write(question)
            st.session_state.last_query = question
            spinner_text = "Analyzing..." if is_reasoning else "Searching..."
            with st.spinner(spinner_text):
                try:
                    if rag_chain:
                        # Invoke the full RAG pipeline:
                        #   1. Reformulate question using chat history
                        #   2. Retrieve relevant chunks from Chroma
                        #   3. Generate answer with retrieved context
                        #   4. Save Q&A to chat history automatically
                        response = rag_chain.invoke(
                            {"input": question},
                            config={"configurable": {"session_id": "doc_chat"}}
                        )
                        st.chat_message("assistant").write(response["answer"])
                        st.session_state.last_sources = response.get("context", [])
                        # Generate contextual follow-up questions based on this exchange
                        st.session_state.followup_questions = generate_followup_questions(
                            question, response["answer"]
                        )
                        # Rerun to display the new follow-up chips (they render above
                        # the input area, which has already been drawn by this point)
                        st.rerun()
                    else:
                        st.error("RAG chain not available. Try re-uploading the document.")
                except Exception as e:
                    st.error(f"Error: {e}")
    # ---- SOURCES COLUMN -----------------------------------------------------
    # Shows the retrieved chunks that the LLM used to generate its answer.
    # Each chunk is in a collapsible expander labeled with its page and chunk number.
    # Search terms from the user's query are highlighted with <mark> tags.
    with sources_col:
        st.markdown("**Sources**")
        sources = st.session_state.last_sources
        if sources:
            # Summary bar — shows how many chunks were retrieved and which pages
            pages_cited = sorted(set(
                str(d.metadata.get("page", "?"))
                for d in sources
                if d.metadata.get("page") is not None
            ))
            pages_str = ", ".join(pages_cited) if pages_cited else "N/A"
            st.markdown(
                f'<div class="source-summary">'
                f'Retrieved <strong>{len(sources)}</strong> chunks '
                f'covering pages <strong>{pages_str}</strong></div>',
                unsafe_allow_html=True,
            )
            for i, doc in enumerate(sources):
                page = doc.metadata.get("page", "?")
                chunk_idx = doc.metadata.get("chunk_index", "?")
                with st.expander(f"Page {page} · Chunk {chunk_idx}"):
                    full_hl = highlight_terms(
                        doc.page_content, st.session_state.last_query
                    )
                    st.markdown(full_hl, unsafe_allow_html=True)
        else:
            st.caption("Sources for the latest answer will appear here.")
# =============================================================================
# 7.0 FOOTER
# =============================================================================
st.markdown(
    '<div class="app-footer">Document Reference Assistant &mdash; '
    'Built with LangChain + Streamlit &mdash; Business Science University</div>',
    unsafe_allow_html=True
)