AI Learning Hub Roadmap Resources Blog Projects Paths
Advanced 8–12 hours · Python, OpenAI API, ChromaDB, SQLite, Streamlit

Build an AI Personal Knowledge Base: Your Second Brain with AI

· 6 min read · AI Learning Hub

Project Overview

A personal knowledge management system (second brain) that ingests notes, web articles, PDFs, and Markdown files. Auto-generates tags and links between related notes. Lets you chat with your entire knowledge base and surface forgotten connections.


Learning Goals

  • Build a multi-source ingestion pipeline (Markdown, PDF, web URLs)
  • Auto-generate tags and link related notes using embeddings
  • Implement semantic search + conversation over personal knowledge
  • Design a note graph with bidirectional links

Architecture

Sources: Markdown / PDF / URL / clipboard
        ↓
Ingest: extract text, generate metadata + tags
        ↓
Embed → ChromaDB vector store
        ↓ (and, in parallel)
SQLite: note metadata, tags, bidirectional links
        ↓
Query: semantic search → related notes → LLM synthesis
        ↓
Streamlit UI: search, chat, note graph visualization

Implementation

Step 1: Setup

pip install openai chromadb pypdf requests beautifulsoup4 streamlit sqlite-utils tiktoken

Step 2: Note Schema + Database

# database.py
import sqlite3
import json
from datetime import datetime
from contextlib import contextmanager

DB_PATH = "knowledge.db"


def init_db():
    """Create the schema (idempotent — all DDL uses IF NOT EXISTS).

    Tables:
      - notes: the documents themselves; ``tags`` holds a JSON array string.
      - links: directed note->note edges with a similarity score; a
        bidirectional link is stored as two rows (see save_link callers).
      - notes_fts: external-content FTS5 table over notes (content=notes),
        kept in sync manually by save_note.
    """
    with get_conn() as conn:
        conn.executescript("""
            CREATE TABLE IF NOT EXISTS notes (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                title TEXT NOT NULL,
                content TEXT NOT NULL,
                source TEXT,
                source_url TEXT,
                tags TEXT DEFAULT '[]',
                created_at TEXT DEFAULT CURRENT_TIMESTAMP,
                updated_at TEXT DEFAULT CURRENT_TIMESTAMP
            );
            CREATE TABLE IF NOT EXISTS links (
                from_id INTEGER,
                to_id INTEGER,
                link_type TEXT DEFAULT 'related',
                similarity REAL,
                PRIMARY KEY (from_id, to_id),
                FOREIGN KEY (from_id) REFERENCES notes(id),
                FOREIGN KEY (to_id) REFERENCES notes(id)
            );
            CREATE VIRTUAL TABLE IF NOT EXISTS notes_fts
            USING fts5(title, content, content=notes, content_rowid=id);
        """)


@contextmanager
def get_conn():
    """Yield a SQLite connection with dict-like rows.

    Commits on clean exit, rolls back if the body raises (previously a
    failed transaction was simply abandoned), and always closes the
    connection.
    """
    conn = sqlite3.connect(DB_PATH)
    conn.row_factory = sqlite3.Row  # lets callers do dict(row)
    try:
        yield conn
        conn.commit()
    except Exception:
        conn.rollback()  # don't leave a half-applied transaction behind
        raise
    finally:
        conn.close()

def save_note(title: str, content: str, source: str = "", url: str = "", tags: list | None = None) -> int:
    """Insert a note plus its full-text index row; return the new note id.

    *tags* is serialized to a JSON array string (the notes.tags column is
    TEXT). Annotation fixed: the old ``tags: list = None`` mis-declared the
    None default.
    """
    with get_conn() as conn:
        cursor = conn.execute(
            "INSERT INTO notes (title, content, source, source_url, tags) VALUES (?,?,?,?,?)",
            (title, content, source, url or "", json.dumps(tags or []))
        )
        note_id = cursor.lastrowid
        # notes_fts is an external-content FTS5 table: it is NOT auto-synced,
        # so the index row must be inserted alongside the notes row.
        conn.execute("INSERT INTO notes_fts(rowid, title, content) VALUES (?,?,?)",
                     (note_id, title, content))
        return note_id


def get_note(note_id: int) -> dict | None:
    """Fetch one note row as a dict, or None if the id doesn't exist.

    Annotation fixed: the old ``-> dict`` hid the None-on-miss case.
    """
    with get_conn() as conn:
        row = conn.execute("SELECT * FROM notes WHERE id=?", (note_id,)).fetchone()
        return dict(row) if row else None


def get_all_notes(limit: int = 500) -> list[dict]:
    """List notes (lightweight columns only), newest first, capped at *limit*."""
    with get_conn() as conn:
        cursor = conn.execute(
            "SELECT id, title, source, tags, created_at FROM notes ORDER BY created_at DESC LIMIT ?",
            (limit,),
        )
        return [dict(row) for row in cursor.fetchall()]


def save_link(from_id: int, to_id: int, similarity: float):
    """Upsert a directed link between two notes (the id pair is the primary key)."""
    sql = "INSERT OR REPLACE INTO links (from_id, to_id, similarity) VALUES (?,?,?)"
    with get_conn() as conn:
        conn.execute(sql, (from_id, to_id, similarity))


def get_related_notes(note_id: int, limit: int = 5) -> list[dict]:
    """Return up to *limit* notes linked from *note_id*, strongest link first."""
    sql = """
            SELECT n.id, n.title, l.similarity
            FROM links l JOIN notes n ON l.to_id = n.id
            WHERE l.from_id = ? ORDER BY l.similarity DESC LIMIT ?
        """
    with get_conn() as conn:
        related = conn.execute(sql, (note_id, limit)).fetchall()
    return [dict(row) for row in related]


# Create the schema on import so every entry point sees ready tables
# (safe to repeat: all DDL in init_db uses IF NOT EXISTS).
init_db()

Step 3: Ingestion Pipeline

# ingestion.py
import json
import hashlib
import chromadb
from pathlib import Path
from openai import OpenAI
from database import save_note, save_link, get_all_notes

# Shared clients: OpenAI for metadata + embeddings, and a persistent Chroma
# vector store that survives restarts under ./chroma_kb.
client = OpenAI()
chroma = chromadb.PersistentClient(path="./chroma_kb")
collection = chroma.get_or_create_collection("knowledge")

# Prompt for LLM-generated note metadata. The double braces are literal JSON
# braces (escaped for str.format); {content} is the only substitution slot.
METADATA_PROMPT = """Analyze this note/document and return JSON:
{{
  "title": "concise descriptive title",
  "tags": ["tag1", "tag2", "tag3"],
  "summary": "2-3 sentence summary"
}}
Content: {content}"""


def generate_metadata(content: str) -> dict:
    """Ask the LLM for a title, tags, and summary for *content*.

    Only the first 3000 characters are sent to bound token cost;
    ``response_format={"type": "json_object"}`` makes the reply parseable
    JSON, so ``json.loads`` is safe here.
    """
    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[{"role": "user", "content": METADATA_PROMPT.format(content=content[:3000])}],
        response_format={"type": "json_object"},
        temperature=0.3,  # low temperature: metadata should be stable, not creative
    )
    # Fix: removed the redundant function-local `import json` — the module
    # already imports json at the top.
    return json.loads(response.choices[0].message.content)


def embed(texts: list[str]) -> list[list[float]]:
    """Embed *texts* with text-embedding-3-small; one vector per input, in order."""
    result = client.embeddings.create(model="text-embedding-3-small", input=texts)
    return [item.embedding for item in result.data]


def ingest_text(content: str, title: str = None, source: str = "text", url: str = "") -> int:
    """Ingest raw text: generate metadata, persist, embed, and auto-link.

    Returns the id of the newly stored note. An explicit *title* wins over
    the LLM-generated one.
    """
    meta = generate_metadata(content)
    note_title = title if title else meta["title"]
    note_tags = meta.get("tags", [])

    note_id = save_note(note_title, content, source, url, note_tags)

    # Index a truncated copy in the vector store under a stable per-note id.
    snippet = content[:4000]
    collection.add(
        ids=[f"note_{note_id}"],
        embeddings=[embed([snippet])[0]],
        documents=[snippet],
        metadatas=[{"note_id": note_id, "title": note_title}],
    )

    # Connect this note to semantically similar existing notes.
    find_and_save_links(note_id, content)

    print(f"Ingested: '{note_title}' (id={note_id}, tags={note_tags})")
    return note_id


def ingest_file(path: str) -> int:
    """Ingest a local file; PDFs get text-extracted, anything else is read as UTF-8."""
    file_path = Path(path)
    if file_path.suffix == ".pdf":
        from pypdf import PdfReader
        pages = PdfReader(str(file_path)).pages
        text = "\n\n".join(page.extract_text() or "" for page in pages)
    else:
        text = file_path.read_text(encoding="utf-8")
    return ingest_text(text, source=file_path.name)


def ingest_url(url: str) -> int:
    """Fetch a web page and ingest its main article text.

    Strips nav/footer/script/style, prefers <article> then <main>, and falls
    back to <body>. Raises requests.HTTPError on a non-2xx response instead
    of silently ingesting an error page (the previous behavior).
    """
    import requests
    from bs4 import BeautifulSoup
    resp = requests.get(url, headers={"User-Agent": "Mozilla/5.0"}, timeout=15)
    resp.raise_for_status()  # fail loudly on 404/500 rather than saving error HTML
    soup = BeautifulSoup(resp.text, "html.parser")
    for tag in soup(["nav", "footer", "script", "style"]):
        tag.decompose()
    article = soup.find("article") or soup.find("main") or soup.body
    content = article.get_text(separator="\n") if article else ""
    return ingest_text(content, source="web", url=url)


def find_and_save_links(note_id: int, content: str, threshold: float = 0.75):
    """Find semantically similar notes and create bidirectional links.

    Queries the vector store for nearest neighbours of *content* and links
    every hit (other than the note itself) whose similarity clears
    *threshold*, writing one link row in each direction.
    """
    embedding = embed([content[:2000]])[0]
    results = collection.query(
        query_embeddings=[embedding],
        n_results=6,  # the note itself is usually among the hits; ~5 real candidates
    )
    # Only metadatas and distances are needed — the previous version also
    # unpacked documents it never used.
    for meta, distance in zip(results["metadatas"][0], results["distances"][0]):
        related_id = meta.get("note_id")
        if related_id and related_id != note_id:
            # NOTE(review): `1 - distance` is only a cosine similarity if the
            # collection was created with cosine space; Chroma's default HNSW
            # metric is squared L2, which would make this threshold behave very
            # differently — confirm the collection's "hnsw:space" setting.
            similarity = 1 - distance
            if similarity >= threshold:
                save_link(note_id, related_id, similarity)
                save_link(related_id, note_id, similarity)

Step 4: Knowledge Chat

# chat.py
from openai import OpenAI
from ingestion import embed, collection
from database import get_note, get_related_notes

client = OpenAI()

# Grounding prompt: the model must answer from the retrieved notes and admit
# when the knowledge base has no answer. Filled via str.format with the
# retrieved note excerpts ({context}) and the user's question ({question}).
CHAT_PROMPT = """You are an AI assistant with access to a personal knowledge base.
Answer questions using the provided notes. Quote specific notes when relevant.
If information is not in the notes, say so.

Relevant notes:
{context}

Question: {question}"""


def search_knowledge(query: str, n: int = 6) -> list[dict]:
    """Semantic search over the vector store; returns note_id/title/content dicts."""
    query_vec = embed([query])[0]
    hits = collection.query(query_embeddings=[query_vec], n_results=n)
    found = []
    for doc, meta in zip(hits["documents"][0], hits["metadatas"][0]):
        found.append({"note_id": meta["note_id"], "title": meta["title"], "content": doc})
    return found


def chat_with_kb(question: str, history: list[dict]) -> dict:
    """Answer *question* from the knowledge base, with conversation context.

    Retrieves the most relevant note chunks, replays prior conversation
    turns, and asks the model to answer grounded in those notes.

    Returns {"answer": str, "sources": [{"id", "title"}, ...]}.
    """
    chunks = search_knowledge(question)
    context = "\n\n---\n\n".join(
        f"**{c['title']}**\n{c['content'][:800]}" for c in chunks
    )
    # Bug fix: `history` was previously ignored, so follow-up questions lost
    # all context. Replay prior turns with role/content only (the API rejects
    # extra keys such as "sources"), and drop a trailing copy of the current
    # question — app.py appends it to history before calling us.
    prior = [{"role": m["role"], "content": m["content"]} for m in history]
    if prior and prior[-1] == {"role": "user", "content": question}:
        prior = prior[:-1]
    messages = prior + [
        {"role": "user", "content": CHAT_PROMPT.format(context=context, question=question)}
    ]
    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=messages,
        max_tokens=800,
        temperature=0.3,
    )
    return {
        "answer": response.choices[0].message.content,
        "sources": [{"id": c["note_id"], "title": c["title"]} for c in chunks],
    }

Step 5: Streamlit App

# app.py
import streamlit as st
from ingestion import ingest_text, ingest_url, ingest_file
from chat import chat_with_kb, search_knowledge
from database import get_all_notes, get_note, get_related_notes
import tempfile, os

# Page chrome: wide layout suits the three-tab knowledge-base UI below.
st.set_page_config(page_title="Knowledge Base", page_icon="🧠", layout="wide")
st.title("🧠 AI Personal Knowledge Base")

tab1, tab2, tab3 = st.tabs(["💬 Chat", "📥 Add Notes", "📚 Browse"])

with tab1:
    # Chat transcript lives in session state so it survives Streamlit reruns.
    if "kb_history" not in st.session_state:
        st.session_state.kb_history = []

    # Replay the transcript; assistant turns also show their source notes.
    for msg in st.session_state.kb_history:
        with st.chat_message(msg["role"]):
            st.write(msg["content"])
            if msg.get("sources"):
                st.caption("Sources: " + ", ".join(s["title"] for s in msg["sources"]))

    if prompt := st.chat_input("Ask your knowledge base..."):
        # Append the user turn, get the answer, then rerun so the loop above
        # redraws the full transcript including the new messages.
        st.session_state.kb_history.append({"role": "user", "content": prompt})
        with st.spinner("Searching knowledge..."):
            result = chat_with_kb(prompt, st.session_state.kb_history)
        st.session_state.kb_history.append({
            "role": "assistant", "content": result["answer"], "sources": result["sources"]
        })
        st.rerun()

with tab2:
    # Three ingestion paths share the same pipeline; only text extraction differs.
    mode = st.radio("Add from", ["Text/Markdown", "URL", "File"], horizontal=True)
    if mode == "Text/Markdown":
        title = st.text_input("Title (optional)")
        content = st.text_area("Content", height=300)
        if st.button("Add Note") and content:
            with st.spinner("Ingesting..."):
                note_id = ingest_text(content, title or None)  # empty title -> auto-generate
            st.success(f"Added note #{note_id}")
    elif mode == "URL":
        url = st.text_input("URL")
        if st.button("Fetch & Add") and url:
            with st.spinner("Fetching..."):
                note_id = ingest_url(url)
            st.success(f"Added note #{note_id}")
    else:
        uploaded = st.file_uploader("Upload PDF or text file", type=["pdf", "txt", "md"])
        if uploaded and st.button("Ingest File"):
            # Spool the upload to disk because ingest_file works on paths.
            with tempfile.NamedTemporaryFile(delete=False, suffix=os.path.splitext(uploaded.name)[1]) as tmp:
                tmp.write(uploaded.read())
                tmp_path = tmp.name
            try:
                with st.spinner("Ingesting..."):
                    note_id = ingest_file(tmp_path)
            finally:
                # Bug fix: previously the temp file leaked whenever ingestion raised.
                os.unlink(tmp_path)
            st.success(f"Added note #{note_id}")

with tab3:
    # Fix: `import json` was inside the per-note expander loop; hoisted here
    # (app.py has no top-level json import, so a local import is kept).
    import json

    notes = get_all_notes(100)
    st.write(f"**{len(notes)} notes** in your knowledge base")
    for note in notes:
        with st.expander(f"{note['title']}"):
            # tags is stored as a JSON array string in SQLite.
            tags = json.loads(note.get("tags", "[]"))
            st.write(f"Source: {note['source']} | Tags: {', '.join(tags)}")
            related = get_related_notes(note["id"])
            if related:
                st.write("Related: " + ", ".join(f"**{r['title']}**" for r in related))

Step 6: Run

# Bulk-ingest an existing notes directory (Markdown only; adjust the glob for
# other extensions). Run from the project root so the imports resolve.
python -c "
from ingestion import ingest_file
from pathlib import Path
for f in Path('my_notes/').glob('*.md'):
    ingest_file(str(f))
"

# Launch the app — the SQLite database and Chroma store are created on first use.
streamlit run app.py

Extension Ideas

  1. Obsidian sync — watch Obsidian vault for new/changed notes and auto-ingest
  2. Browser extension — one-click web clipping to your knowledge base
  3. Scheduled digests — weekly email of "forgotten notes" that are newly relevant
  4. Note graph visualization — interactive graph of note connections with D3.js
  5. Spaced repetition — surface notes for review based on forgetting curves

What to Learn Next

← Back to all projects