FAISS Setup & Configuration
FAISS (Facebook AI Similarity Search) is a library for efficient similarity search and clustering of dense vectors. Let's install and configure it for our multimodal search system.
🏗️ FAISS vs Alternatives Comparison
🔍 FAISS
✅ Advantages
- Extremely fast (sub-millisecond search)
- Free and open source
- Runs locally - full data control
- Multiple index types for different needs
- GPU acceleration support
❌ Trade-offs
- Requires more setup and configuration
- Manual scaling and management
- Need to handle persistence yourself (a save/load sketch appears after Step 3 below)
🌲 Pinecone
✅ Advantages
- Fully managed service
- Auto-scaling and backups
- Real-time updates
- Built-in metadata filtering
❌ Trade-offs
- Expensive ($70/month+)
- Network latency overhead
- Vendor lock-in
- Data privacy concerns
🎯 Our Choice: FAISS
For this tutorial, we're using FAISS because it's free, extremely fast, and perfect for learning the fundamentals of vector search.
Step 1: Install FAISS
# In your Docker container, install FAISS
# For CPU-only version
pip install faiss-cpu
# For GPU version (if you have CUDA)
# Note: the official binary distribution of FAISS is via conda; the PyPI
# faiss-gpu wheel can lag behind releases
pip install faiss-gpu
# Also install additional dependencies
pip install numpy scikit-learn matplotlib
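Before moving on, it's worth confirming the install actually works. A quick smoke test (the file name is just a suggestion):
# verify_faiss.py - confirm FAISS imports and searches correctly
import faiss
import numpy as np

print(f"FAISS version: {faiss.__version__}")

# Build a tiny inner-product index and run one search
d = 8
index = faiss.IndexFlatIP(d)
vectors = np.random.rand(10, d).astype(np.float32)
faiss.normalize_L2(vectors)  # normalize in place so inner product = cosine similarity
index.add(vectors)
scores, ids = index.search(vectors[:1], 3)
print(f"Nearest ids: {ids[0]}, scores: {scores[0]}")  # first hit should be the query itself, score ~1.0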
Step 2: Create Vector Storage Service
# Create vector_service.py
import faiss
import numpy as np
import pickle
import os
import time
import logging
from typing import List, Dict, Any, Optional, Tuple
from pathlib import Path
import json
from dataclasses import dataclass, asdict
from config import settings
logger = logging.getLogger(__name__)
@dataclass
class VectorItem:
    """Represents a vector with metadata"""
    id: str
    vector: List[float]
    metadata: Dict[str, Any]
    modality: str
    timestamp: float

@dataclass
class SearchResult:
    """Represents a search result"""
    id: str
    score: float
    metadata: Dict[str, Any]
    modality: str
class FAISSVectorStore:
    """FAISS-based vector storage and search"""

    def __init__(self,
                 dimension: int = 1024,
                 index_type: str = "IVF",
                 storage_dir: str = "vector_storage"):
        """
        Initialize FAISS vector store

        Args:
            dimension: Vector dimension (1024 for ImageBind)
            index_type: FAISS index type ('Flat', 'IVF', 'HNSW')
            storage_dir: Directory to store index and metadata
        """
        self.dimension = dimension
        self.index_type = index_type
        self.storage_dir = Path(storage_dir)
        self.storage_dir.mkdir(exist_ok=True)

        # FAISS index
        self.index = None
        self.is_trained = False

        # Metadata storage
        self.id_to_metadata = {}  # id -> metadata
        self.index_to_id = {}     # faiss_index -> id
        self.id_to_index = {}     # id -> faiss_index
        self.next_index = 0

        # Index configuration
        self.index_config = {
            'Flat': {'description': 'Exact search, best accuracy'},
            'IVF': {'description': 'Inverted file index, good speed/accuracy balance'},
            'HNSW': {'description': 'Hierarchical NSW, fastest search'}
        }

        self._initialize_index()
    def _initialize_index(self):
        """Initialize the FAISS index based on type"""
        try:
            if self.index_type == "Flat":
                # Exact (brute-force) search - best accuracy, slower on large datasets.
                # Inner product equals cosine similarity because we L2-normalize vectors.
                self.index = faiss.IndexFlatIP(self.dimension)
                self.is_trained = True
                logger.info("Initialized Flat index for exact search")
            elif self.index_type == "IVF":
                # Inverted file index - good balance of speed and accuracy
                nlist = 100  # number of clusters
                quantizer = faiss.IndexFlatIP(self.dimension)
                self.index = faiss.IndexIVFFlat(quantizer, self.dimension, nlist)
                logger.info(f"Initialized IVF index with {nlist} clusters")
            elif self.index_type == "HNSW":
                # Hierarchical Navigable Small World - fastest search.
                # Pass METRIC_INNER_PRODUCT explicitly: the default metric is L2,
                # which would make scores inconsistent with the other index types.
                M = 16  # number of connections per node
                self.index = faiss.IndexHNSWFlat(self.dimension, M, faiss.METRIC_INNER_PRODUCT)
                self.index.hnsw.efConstruction = 200
                self.index.hnsw.efSearch = 50
                self.is_trained = True
                logger.info(f"Initialized HNSW index with M={M}")
            else:
                raise ValueError(f"Unsupported index type: {self.index_type}")
        except Exception as e:
            logger.error(f"Failed to initialize FAISS index: {e}")
            raise
    def add_vector(self, vector_item: VectorItem) -> bool:
        """Add a vector to the index"""
        try:
            # Validate vector dimension
            if len(vector_item.vector) != self.dimension:
                raise ValueError(f"Vector dimension {len(vector_item.vector)} != {self.dimension}")

            # Check if ID already exists
            if vector_item.id in self.id_to_index:
                logger.warning(f"Vector ID {vector_item.id} already exists, skipping")
                return False

            # Convert to numpy and L2-normalize so inner product = cosine similarity
            vector = np.array(vector_item.vector, dtype=np.float32).reshape(1, -1)
            vector = vector / np.linalg.norm(vector, axis=1, keepdims=True)

            # Train the IVF index once enough vectors have accumulated.
            # k-means needs at least nlist (100) training points; waiting for
            # 256 leaves some headroom.
            if not self.is_trained and self.index_type == "IVF":
                if self.next_index >= 256:
                    self._train_index()

            # Add to the FAISS index if trained; otherwise the vector stays
            # buffered in id_to_metadata and is added during _train_index
            if self.is_trained:
                self.index.add(vector)

            # Update mappings
            faiss_index = self.next_index
            self.index_to_id[faiss_index] = vector_item.id
            self.id_to_index[vector_item.id] = faiss_index
            self.next_index += 1

            # Store metadata
            self.id_to_metadata[vector_item.id] = {
                'metadata': vector_item.metadata,
                'modality': vector_item.modality,
                'timestamp': vector_item.timestamp,
                'vector': vector_item.vector  # keep original vector for training
            }

            logger.debug(f"Added vector {vector_item.id} (index: {self.next_index-1 if self.is_trained else 'pending'})")
            return True
        except Exception as e:
            logger.error(f"Failed to add vector {vector_item.id}: {e}")
            return False
    def _train_index(self):
        """Train the index (IVF only)"""
        try:
            if self.index_type != "IVF" or self.is_trained:
                return

            # Collect all buffered vectors for training
            training_vectors = []
            for item_data in self.id_to_metadata.values():
                vector = np.array(item_data['vector'], dtype=np.float32)
                vector = vector / np.linalg.norm(vector)  # L2 normalize
                training_vectors.append(vector)

            if len(training_vectors) < 256:
                logger.warning(f"Not enough vectors for training: {len(training_vectors)} < 256")
                return

            training_data = np.vstack(training_vectors)
            logger.info(f"Training IVF index with {len(training_vectors)} vectors...")
            self.index.train(training_data)
            self.is_trained = True

            # Add all buffered vectors to the newly trained index
            self.index.add(training_data)

            # Rebuild index mappings; this relies on dicts preserving insertion
            # order (Python 3.7+) so that FAISS row i matches the i-th added id
            for i, item_id in enumerate(self.id_to_metadata.keys()):
                self.index_to_id[i] = item_id
                self.id_to_index[item_id] = i

            self.next_index = len(training_vectors)
            logger.info("IVF index training completed")
        except Exception as e:
            logger.error(f"Failed to train index: {e}")
            raise
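The listing above defines SearchResult but stops before the method that produces it. A minimal sketch of what search could look like, reusing the mappings maintained by add_vector and _train_index (this method is not part of the original listing, so treat it as one possible implementation):
    def search(self, query_vector: List[float], k: int = 10) -> List[SearchResult]:
        """Return the k most similar stored vectors (sketch)."""
        if not self.is_trained or self.index.ntotal == 0:
            return []

        # Normalize the query the same way stored vectors were normalized
        query = np.array(query_vector, dtype=np.float32).reshape(1, -1)
        query = query / np.linalg.norm(query, axis=1, keepdims=True)

        # For IVF, consider raising self.index.nprobe (default 1) for better recall
        scores, indices = self.index.search(query, min(k, self.index.ntotal))

        results = []
        for score, idx in zip(scores[0], indices[0]):
            if idx == -1:  # FAISS pads with -1 when fewer than k hits exist
                continue
            item_id = self.index_to_id[int(idx)]
            stored = self.id_to_metadata[item_id]
            results.append(SearchResult(
                id=item_id,
                score=float(score),
                metadata=stored['metadata'],
                modality=stored['modality'],
            ))
        return results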
Step 3: Configuration Settings
# Add to config.py
# Pydantic is assumed here: BaseSettings comes from pydantic (v1) or the
# pydantic-settings package (v2); nested settings groups should subclass
# BaseModel so they can be validated as fields of Settings
from pydantic import BaseModel

class VectorSearchSettings(BaseModel):
    # FAISS Configuration
    vector_dimension: int = 1024
    index_type: str = "IVF"  # "Flat", "IVF", "HNSW"
    storage_directory: str = "vector_storage"

    # Search Configuration
    default_search_k: int = 10
    max_search_k: int = 100
    similarity_threshold: float = 0.0

    # Performance Configuration
    ivf_nlist: int = 100  # Number of clusters for IVF
    hnsw_m: int = 16      # Connections for HNSW
    hnsw_ef_construction: int = 200
    hnsw_ef_search: int = 50

    # Storage Configuration
    auto_save_interval: int = 100  # Save every N additions
    backup_enabled: bool = True
    compression_enabled: bool = True

# Update the main settings class
class Settings(BaseSettings):
    # ... existing settings ...

    # Vector Search Settings
    vector_search: VectorSearchSettings = VectorSearchSettings()

# Update the global settings instance
settings = Settings()
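The storage settings above (storage_directory, auto_save_interval) exist because, as noted in the trade-off list, FAISS leaves persistence to you. A minimal save/load sketch for FAISSVectorStore, using faiss.write_index/faiss.read_index for the index and pickle (already imported in vector_service.py) for the mappings; the file names and method names are illustrative, not part of the original listing:
    def save(self):
        """Persist the FAISS index and metadata to storage_dir (sketch)."""
        if self.is_trained and self.index.ntotal > 0:
            faiss.write_index(self.index, str(self.storage_dir / "index.faiss"))
        with open(self.storage_dir / "metadata.pkl", "wb") as f:
            pickle.dump({
                'id_to_metadata': self.id_to_metadata,
                'index_to_id': self.index_to_id,
                'id_to_index': self.id_to_index,
                'next_index': self.next_index,
                'is_trained': self.is_trained,
            }, f)

    def load(self) -> bool:
        """Restore a previously saved index and metadata (sketch)."""
        index_path = self.storage_dir / "index.faiss"
        meta_path = self.storage_dir / "metadata.pkl"
        if not (index_path.exists() and meta_path.exists()):
            return False
        self.index = faiss.read_index(str(index_path))
        with open(meta_path, "rb") as f:
            state = pickle.load(f)
        self.__dict__.update(state)  # restores the five fields saved above
        return True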
🔧 Index Type Selection Guide
- Flat: Use for <100K vectors, guarantees exact results
- IVF: Use for 100K-10M vectors, good speed/accuracy balance
- HNSW: Use for >1M vectors, fastest search but uses more memory
- Training: IVF needs training data; this implementation buffers 256+ vectors first (k-means requires at least nlist points)
- Memory: Flat and IVF use less memory than HNSW
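Putting the pieces together, here is how the config and service could be wired end to end (the demo vector and IDs are placeholders, and search refers to the sketch from Step 2):
# demo.py - end-to-end wiring of config.py and vector_service.py
import time
from config import settings
from vector_service import FAISSVectorStore, VectorItem

dim = settings.vector_search.vector_dimension
store = FAISSVectorStore(
    dimension=dim,
    index_type="Flat",  # Flat needs no training, so one vector is enough for a demo
    storage_dir=settings.vector_search.storage_directory,
)

store.add_vector(VectorItem(
    id="demo-1",
    vector=[0.1] * dim,
    metadata={"source": "demo"},
    modality="text",
    timestamp=time.time(),
))

results = store.search([0.1] * dim, k=1)  # assumes the search sketch above
print(results[0].id, results[0].score)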