Semantic Search: Build Search That Understands Meaning (2026)

M
Mamta Chauhan
Content Creator and AI Enthusiast

Keyword Search vs. Semantic Search

Keyword search matches exact words:

plaintext
Query: "car repair"
Finds: "car repair shops near me" ✓
Misses: "automobile maintenance" ✗  (same meaning, different words)

Semantic search matches meaning:

plaintext
Query: "car repair"
Finds: "automobile maintenance" ✓
Finds: "vehicle service center" ✓
Finds: "fix my broken engine" ✓

This is possible because embeddings encode meaning as vectors, and similar meanings produce similar vectors.


How Embeddings Work

An embedding model converts text → a dense vector of floats:

Python
from openai import OpenAI
import numpy as np

client = OpenAI()

def embed(text: str) -> np.ndarray:
    """Return the embedding of *text* as a NumPy array.

    Sends a single input to the ``text-embedding-3-small`` model through the
    module-level ``client`` and converts the first (only) returned embedding
    into an array.
    """
    result = client.embeddings.create(
        input=text,
        model="text-embedding-3-small",
    )
    vector = result.data[0].embedding
    return np.array(vector)

# Embed a few sample texts to compare below: "cat" and "feline" are
# near-synonyms, "dog" is a related concept, and the last phrase is unrelated.
# Each call to embed() makes one API request.
cat_emb    = embed("cat")
feline_emb = embed("feline")
dog_emb    = embed("dog")
code_emb   = embed("Python programming language")

# Cosine similarity: how alike are two vectors?
def cosine_sim(a: np.ndarray, b: np.ndarray) -> float:
    """Return the cosine similarity between vectors *a* and *b*.

    The result is the dot product divided by the product of the two L2 norms,
    so it lies in [-1, 1] for non-zero vectors. If either vector has zero
    norm the similarity is undefined; 0.0 is returned instead of NaN.
    """
    denom = np.linalg.norm(a) * np.linalg.norm(b)
    if denom == 0:
        # Avoid 0/0 -> NaN (and a RuntimeWarning) for all-zero vectors.
        return 0.0
    return float(np.dot(a, b) / denom)

# The scores in the trailing comments are illustrative; actual values depend
# on the embedding model.
print(f"cat ↔ feline: {cosine_sim(cat_emb, feline_emb):.3f}")   # 0.91 (very similar)
print(f"cat ↔ dog:    {cosine_sim(cat_emb, dog_emb):.3f}")       # 0.82 (related)
print(f"cat ↔ code:   {cosine_sim(cat_emb, code_emb):.3f}")      # 0.17 (unrelated)

The 1536-dimensional embedding space geometrically encodes semantic relationships.


Building a Basic Semantic Search Engine

Python
import numpy as np
from openai import OpenAI
from dataclasses import dataclass

client = OpenAI()


@dataclass
class Document:
    """A piece of searchable text with an identifier and attached metadata."""
    # Identifier for the document (the examples use "1", "2", ...)
    id: str
    # Text content; this is the field that gets embedded and searched
    text: str
    # Free-form key/value data, e.g. {"category": "ml"}
    metadata: dict


class SemanticSearchEngine:
    """Brute-force, in-memory semantic search backed by an embeddings API.

    Attributes:
        model: Name of the embedding model passed to the API.
        documents: Indexed documents; ``documents[i]`` owns row ``i`` of
            ``embeddings``.
        embeddings: Unit-length document vectors, one per row, or ``None``
            until the first document has been added.
    """

    def __init__(self, model: str = "text-embedding-3-small"):
        self.model = model
        self.documents: list[Document] = []
        self.embeddings: np.ndarray | None = None

    @staticmethod
    def _normalize(vectors: np.ndarray) -> np.ndarray:
        """Scale vectors to unit L2 norm along the last axis; zero vectors stay zero."""
        norms = np.linalg.norm(vectors, axis=-1, keepdims=True)
        # Dividing a zero vector by its norm would give NaN and poison every
        # similarity computed against it, so divide those rows by 1 instead.
        return vectors / np.where(norms == 0, 1.0, norms)

    def embed_batch(self, texts: list[str]) -> np.ndarray:
        """Embed multiple texts efficiently in one API call.

        Returns a 2-D array with one row per entry of ``response.data``.
        """
        response = client.embeddings.create(model=self.model, input=texts)
        return np.array([item.embedding for item in response.data])

    def add_documents(self, docs: "list[Document]", batch_size: int = 100):
        """Add documents and compute their embeddings.

        Args:
            docs: Documents to index. An empty list is a no-op.
            batch_size: Maximum number of texts sent per API call.

        Raises:
            ValueError: If ``batch_size`` is smaller than 1.
        """
        if batch_size < 1:
            raise ValueError("batch_size must be at least 1")
        if not docs:
            # np.vstack([]) raises, and there is nothing to embed anyway.
            return

        texts = [doc.text for doc in docs]

        new_embeddings = []
        for i in range(0, len(texts), batch_size):
            batch = texts[i:i + batch_size]
            new_embeddings.append(self.embed_batch(batch))
            print(f"  Embedded {min(i + batch_size, len(texts))}/{len(texts)}")

        # Normalize for cosine similarity (dot product of normalized = cosine)
        new_embs = self._normalize(np.vstack(new_embeddings))

        # Commit only after every batch was embedded, so a failed API call
        # cannot leave `documents` and `embeddings` with different lengths.
        self.documents.extend(docs)
        if self.embeddings is None:
            self.embeddings = new_embs
        else:
            self.embeddings = np.vstack([self.embeddings, new_embs])

    def search(self, query: str, top_k: int = 5, threshold: float = 0.5) -> list[dict]:
        """Find documents most semantically similar to query.

        Args:
            query: Free-text query to embed and compare against the index.
            top_k: Maximum number of results; values <= 0 yield no results.
            threshold: Minimum cosine similarity a result must reach.

        Returns:
            Dicts with keys ``"document"`` and ``"score"`` (cosine similarity
            rounded to 4 decimals), ordered from most to least similar.
        """
        if self.embeddings is None or top_k <= 0:
            return []

        # Embed and normalize query
        q_emb = self._normalize(self.embed_batch([query])[0])

        # Cosine similarities (dot product of normalized vectors)
        similarities = self.embeddings @ q_emb

        # Sort and filter
        top_indices = np.argsort(similarities)[::-1][:top_k]
        results = []
        for idx in top_indices:
            sim = float(similarities[idx])
            if sim >= threshold:
                results.append({
                    "document": self.documents[idx],
                    "score": round(sim, 4),
                })
        return results


# Example usage: index a handful of article titles, then run one query.
# Both add_documents() and search() call the embeddings API.
engine = SemanticSearchEngine()

# Each Document is (id, text, metadata); only the text is embedded.
docs = [
    Document("1", "How to train a machine learning model with scikit-learn", {"category": "ml"}),
    Document("2", "Introduction to neural networks and deep learning", {"category": "dl"}),
    Document("3", "Building REST APIs with FastAPI and Python", {"category": "backend"}),
    Document("4", "Kubernetes deployment strategies for microservices", {"category": "devops"}),
    Document("5", "Fine-tuning LLMs with LoRA for custom tasks", {"category": "llm"}),
    Document("6", "Vector databases and approximate nearest neighbor search", {"category": "rag"}),
    Document("7", "GPT-4 API integration in Python applications", {"category": "llm"}),
    Document("8", "Gradient descent optimization algorithms explained", {"category": "ml"}),
]

engine.add_documents(docs)

# Search — even if query uses different words from documents.
# Defaults apply: at most 5 results, each with similarity >= 0.5.
results = engine.search("teaching computers to learn from data")
for r in results:
    print(f"{r['score']:.3f}: {r['document'].text}")

# Expected top results: ml/dl articles, not kubernetes or FastAPI
# (exact scores and ordering depend on the embedding model)

Production: Approximate Nearest Neighbor (ANN) Search

The naive approach compares the query against every stored vector — O(n · d) work per query for n documents of dimension d — which works for thousands of documents. For millions, you need Approximate Nearest Neighbor (ANN) indexes.

FAISS (Facebook AI Similarity Search)

Bash
pip install faiss-cpu  # or faiss-gpu for NVIDIA GPU
Python
import faiss
import numpy as np

# Build a FAISS index
d = 1536  # embedding dimension

# Flat index: exact search (good up to ~100k)
index_flat = faiss.IndexFlatIP(d)  # Inner Product = cosine similarity (with normalized vectors)

# IVF index: approximate search (millions of vectors)
# The metric is passed explicitly because IndexIVFFlat defaults to L2, which
# would not match the inner-product quantizer.
# NOTE: an IVF index must be trained (index_ivf.train(vectors)) before add().
quantizer = faiss.IndexFlatIP(d)
index_ivf = faiss.IndexIVFFlat(quantizer, d, 100, faiss.METRIC_INNER_PRODUCT)  # 100 clusters

# HNSW: best recall/speed tradeoff (recommended for most use cases)
# IndexHNSWFlat also defaults to L2; request inner product so that search()
# returns cosine similarities (higher = closer) for normalized vectors
# instead of squared L2 distances (lower = closer).
index_hnsw = faiss.IndexHNSWFlat(d, 32, faiss.METRIC_INNER_PRODUCT)  # 32 = M parameter (connections per node)
index_hnsw.hnsw.efConstruction = 40  # build-time parameter

# Add vectors (random data stands in for real embeddings here)
embeddings = np.random.randn(100000, d).astype(np.float32)
faiss.normalize_L2(embeddings)  # normalize in place for cosine similarity

index_hnsw.add(embeddings)
print(f"Index contains {index_hnsw.ntotal} vectors")

# Search: the query must be float32 with shape (n_queries, d)
query = np.random.randn(1, d).astype(np.float32)
faiss.normalize_L2(query)

k = 10  # return top 10
distances, indices = index_hnsw.search(query, k)
print(f"Top match index: {indices[0][0]}, similarity: {distances[0][0]:.4f}")

# Save and load
faiss.write_index(index_hnsw, "search_index.bin")
loaded_index = faiss.read_index("search_index.bin")

HNSW Explained

HNSW (Hierarchical Navigable Small World) is a graph-based index:

plaintext
Layer 2 (sparse):  A -------- F
                   |          |
Layer 1:           A --- C -- F --- H
                   |    |    |     |
Layer 0 (dense):   A-B-C-D-E-F-G-H-I-J

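Search: start at the top (sparsest) layer, greedily move to the neighbor closest to the query, then descend and repeat on each lower layer. This takes roughly O(log n) steps instead of the O(n) comparisons of a full scan.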

Key parameters:

  • M (16–64): connections per node. Higher = better recall, more memory.
  • efConstruction (100–800): build-time search width. Higher = better quality, slower build.
  • ef (50–500): search-time width (called efSearch in FAISS). Higher = better recall, slower search.

ChromaDB: Full-Featured Vector Database

Bash
pip install chromadb
Python
import os

import chromadb
from chromadb.utils import embedding_functions

# Use OpenAI embeddings automatically.
# The key is read from the environment rather than hard-coded in source.
openai_ef = embedding_functions.OpenAIEmbeddingFunction(
    api_key=os.environ["OPENAI_API_KEY"],
    model_name="text-embedding-3-small",
)

# Named `chroma_client` so it does not rebind the OpenAI `client` that the
# embedding helpers earlier in this guide rely on.
chroma_client = chromadb.PersistentClient(path="./chroma_db")
collection = chroma_client.get_or_create_collection(
    "articles",
    embedding_function=openai_ef,
    metadata={"hnsw:space": "cosine"},  # use cosine similarity
)

# Add documents (auto-embeds)
collection.add(
    ids=["doc1", "doc2", "doc3"],
    documents=["Text of doc 1", "Text of doc 2", "Text of doc 3"],
    metadatas=[{"category": "ml"}, {"category": "dl"}, {"category": "rag"}],
)

# Query
results = collection.query(
    query_texts=["machine learning tutorial"],
    n_results=3,
    where={"category": "ml"},  # metadata filtering
    include=["documents", "distances", "metadatas"],
)

# Results are nested per query text; [0] selects the single query above.
# With the cosine space, distance = 1 - similarity, so convert back.
for doc, dist, meta in zip(
    results["documents"][0],
    results["distances"][0],
    results["metadatas"][0]
):
    print(f"{1 - dist:.3f}: [{meta['category']}] {doc[:80]}")

Hybrid Search: Best of Both Worlds

Pure semantic search sometimes misses exact keyword matches. Hybrid search combines BM25 (keyword) + vector (semantic):

Python
from rank_bm25 import BM25Okapi
import numpy as np


class HybridSearchEngine:
    """Rank documents by a weighted blend of semantic and BM25 keyword scores.

    Wraps a semantic engine and shares its ``documents`` list; the BM25 index
    is built from those documents' lower-cased, whitespace-split texts.
    """

    def __init__(self, semantic_engine: "SemanticSearchEngine"):
        self.semantic = semantic_engine
        self.bm25 = None
        # Number of documents the current BM25 index was built from; used to
        # detect an index that has gone stale after more documents were added.
        self._bm25_size = 0

    def build_bm25(self):
        """Build BM25 index from document texts."""
        tokenized = [doc.text.lower().split() for doc in self.semantic.documents]
        self.bm25 = BM25Okapi(tokenized)
        self._bm25_size = len(tokenized)

    def search(self, query: str, top_k: int = 5, alpha: float = 0.7) -> list[dict]:
        """
        Return up to ``top_k`` documents ranked by combined score.

        alpha: weight for semantic (1 - alpha for BM25)
        alpha=1.0 = pure semantic, alpha=0.0 = pure keyword

        The BM25 index is built on first use and rebuilt whenever the number
        of documents has changed. Only documents with a combined score > 0
        are returned; an empty corpus yields an empty list.
        """
        documents = self.semantic.documents
        n = len(documents)
        if n == 0:
            return []

        if self.bm25 is None or self._bm25_size != n:
            self.build_bm25()

        # Map each document object to its row(s). Identity is used instead of
        # list.index(), which is O(n) per lookup and returns the wrong row
        # when two distinct documents compare equal.
        positions: dict[int, list[int]] = {}
        for i, doc in enumerate(documents):
            positions.setdefault(id(doc), []).append(i)

        # Semantic scores
        sem_scores = np.zeros(n)
        for r in self.semantic.search(query, top_k=n, threshold=0):
            sem_scores[positions[id(r["document"])]] = r["score"]

        # BM25 scores (normalize to [0, 1])
        bm25_scores = np.array(self.bm25.get_scores(query.lower().split()))
        if bm25_scores.max() > 0:
            bm25_scores = bm25_scores / bm25_scores.max()

        # Combine
        combined = alpha * sem_scores + (1 - alpha) * bm25_scores
        top_indices = np.argsort(combined)[::-1][:top_k]

        return [
            {"document": documents[i], "score": round(float(combined[i]), 4)}
            for i in top_indices if combined[i] > 0
        ]

When to use hybrid:

  • Technical documentation (needs exact term matching)
  • Code search (identifiers must match exactly)
  • Medical/legal (precise terminology matters)

When pure semantic is fine:

  • General knowledge Q&A
  • Customer support
  • FAQ matching

What to Learn Next

MC
Mamta Chauhan
Content Creator and AI Enthusiast

Mamta Chauhan is an AI enthusiast and content creator behind ailearnings.in. She writes practical guides on LLMs, RAG, and AI engineering to help developers navigate the fast-moving world of artificial intelligence. Passionate about bridging the gap between cutting-edge research and real-world application.

← Back to all articles