Session 10: Semantic and Vector Search Systems
Azure Cognitive Search and modern search techniques
🎯 Session goals
- Implement Azure Cognitive Search with semantic capabilities
- Build vector search systems
- Integrate hybrid search (text + vector)
- Build an intelligent knowledge base
🔍 Semantic vs. traditional search
The evolution of search
Traditional (keyword-based) search:
Query: "apple fruit nutrition"
Results: documents containing EXACTLY these words
Problem: misses "red apple vitamins" (synonyms and paraphrases)
Semantic search:
Query: "apple fruit nutrition"
Results: documents about apples, fruit, and nutritional values
Advantage: it understands MEANING, not just words
Vector search:
Query: [0.2, -0.8, 0.1, ...] (embedding vector)
Results: documents with similar semantics
Power: conceptually related content
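A minimal sketch of the difference, using a sentence-transformers model (the model name and the example strings are illustrative, not from the course corpus):

from sentence_transformers import SentenceTransformer, util

model = SentenceTransformer("all-MiniLM-L6-v2")

query = "apple fruit nutrition"
document = "Red apples are rich in vitamins and fiber."

# Keyword search: only "apple" overlaps, so a strict all-terms match fails
keyword_hit = all(term in document.lower() for term in query.split())
print("keyword match:", keyword_hit)  # False

# Semantic search: embeddings capture the shared meaning despite different words
embeddings = model.encode([query, document], normalize_embeddings=True)
print("cosine similarity:", float(util.cos_sim(embeddings[0], embeddings[1])))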
Embedding models for semantic search
from sentence_transformers import SentenceTransformer
import numpy as np
from typing import List, Dict, Tuple
import faiss

class SemanticSearchEngine:
    def __init__(self, model_name="all-MiniLM-L6-v2"):
        self.embedding_model = SentenceTransformer(model_name)
        self.document_store = {}
        self.doc_ids = []
        self.vector_index = None
        self.document_embeddings = None

        print(f"🧠 Loaded embedding model: {model_name}")
        print(f"📐 Embedding dimension: {self.embedding_model.get_sentence_embedding_dimension()}")

    def index_documents(self, documents: List[Dict]):
        """Index documents for semantic search"""
        print(f"📚 Indexing {len(documents)} documents...")

        # Prepare texts for embedding
        texts = []
        self.doc_ids = []

        for doc in documents:
            doc_id = doc["id"]
            # Combine title and content for a richer embedding
            combined_text = f"{doc.get('title', '')} {doc.get('content', '')}"

            texts.append(combined_text)
            self.doc_ids.append(doc_id)

            # Store the document
            self.document_store[doc_id] = doc

        # Generate embeddings
        print("🔄 Generating embeddings...")
        embeddings = self.embedding_model.encode(texts, show_progress_bar=True)
        self.document_embeddings = embeddings

        # Build a FAISS index for fast search
        print("🗂️ Building vector index...")
        dimension = embeddings.shape[1]

        # Use an HNSW index for better performance
        self.vector_index = faiss.IndexHNSWFlat(dimension, 32)
        self.vector_index.hnsw.efConstruction = 64
        self.vector_index.hnsw.efSearch = 64

        # Add embeddings to the index
        self.vector_index.add(embeddings.astype('float32'))

        print(f"✅ Indexed {len(documents)} documents")

        return {
            "indexed_documents": len(documents),
            "embedding_dimension": dimension,
            "index_type": "HNSW"
        }

    def semantic_search(self, query: str, top_k: int = 10, similarity_threshold: float = 0.3) -> List[Dict]:
        """Semantic search"""
        if self.vector_index is None:
            raise ValueError("No documents indexed. Call index_documents first.")

        # Generate an embedding for the query
        query_embedding = self.embedding_model.encode([query])

        # Search for similar documents
        distances, indices = self.vector_index.search(
            query_embedding.astype('float32'),
            top_k
        )

        results = []
        for i, (distance, doc_index) in enumerate(zip(distances[0], indices[0])):
            if doc_index < 0:
                # FAISS pads with -1 when fewer than top_k neighbors exist
                continue

            # FAISS returns distances (smaller = more similar); convert to a rough similarity score
            similarity_score = 1.0 / (1.0 + distance)

            if similarity_score >= similarity_threshold:
                # Look up the document by its position in the index
                doc_id = self.doc_ids[doc_index]
                document = self.document_store[doc_id]

                results.append({
                    "document_id": doc_id,
                    "title": document.get("title", ""),
                    "content_preview": document.get("content", "")[:200] + "...",
                    "similarity_score": float(similarity_score),
                    "rank": i + 1,
                    "metadata": document.get("metadata", {})
                })

        return results

    def hybrid_search(self, query: str, keywords: List[str] = None, top_k: int = 10) -> List[Dict]:
        """Hybrid search: semantic + keyword"""
        # Semantic search
        semantic_results = self.semantic_search(query, top_k * 2)  # Fetch extra candidates

        # Keyword filtering, if keywords were provided
        if keywords:
            filtered_results = []

            for result in semantic_results:
                content = result["content_preview"].lower()
                title = result["title"].lower()

                # Check whether at least one keyword occurs
                if any(keyword.lower() in content or keyword.lower() in title for keyword in keywords):
                    # Boost the score for keyword matches
                    keyword_matches = sum(1 for kw in keywords if kw.lower() in content or kw.lower() in title)
                    result["similarity_score"] *= (1 + keyword_matches * 0.1)  # 10% boost per match
                    result["keyword_matches"] = keyword_matches
                    filtered_results.append(result)

            # Re-sort by the boosted scores
            filtered_results.sort(key=lambda x: x["similarity_score"], reverse=True)
            semantic_results = filtered_results

        # Return the top_k results
        return semantic_results[:top_k]

    def explain_search_results(self, query: str, results: List[Dict]) -> Dict:
        """Explain why the documents were retrieved"""
        query_embedding = self.embedding_model.encode([query], normalize_embeddings=True)

        explanations = []

        for result in results[:5]:  # Explain top 5
            doc_id = result["document_id"]
            doc = self.document_store[doc_id]

            # Find the best-matching fragments
            doc_text = f"{doc.get('title', '')} {doc.get('content', '')}"

            # Split into sentences and find the most similar one
            sentences = [s.strip() for s in doc_text.split('.') if s.strip()]
            sentence_embeddings = self.embedding_model.encode(sentences, normalize_embeddings=True)

            # Cosine similarities between the query and the sentences
            sentence_similarities = np.dot(query_embedding, sentence_embeddings.T).flatten()
            best_sentence_idx = np.argmax(sentence_similarities)

            explanations.append({
                "document_id": doc_id,
                "title": doc.get("title", ""),
                "most_relevant_sentence": sentences[best_sentence_idx],
                "relevance_score": float(sentence_similarities[best_sentence_idx]),
                "explanation": f"This document matched because of the sentence: '{sentences[best_sentence_idx][:100]}...'"
            })

        return {
            "query": query,
            "explanations": explanations,
            "search_strategy": "semantic_similarity_with_sentence_analysis"
        }
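A short usage sketch of the engine defined above (the documents and queries are made up for illustration):

engine = SemanticSearchEngine()

docs = [
    {"id": "1", "title": "Apple nutrition", "content": "Apples are rich in fiber and vitamin C."},
    {"id": "2", "title": "Electric cars", "content": "EV batteries degrade slowly over many charge cycles."},
]
engine.index_documents(docs)

# Pure semantic search
results = engine.semantic_search("fruit vitamins", top_k=5)
for r in results:
    print(r["rank"], r["title"], round(r["similarity_score"], 3))

# Hybrid search with an additional keyword filter and score boosting
hybrid = engine.hybrid_search("fruit vitamins", keywords=["fiber"], top_k=5)
explanation = engine.explain_search_results("fruit vitamins", results)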
🏗️ Azure Cognitive Search Implementation
Configuring the search service
from azure.core.credentials import AzureKeyCredential
from azure.search.documents import SearchClient
from azure.search.documents.models import (
    VectorizedQuery, QueryType, QueryCaptionType, QueryAnswerType
)
from azure.search.documents.indexes import SearchIndexClient
from azure.search.documents.indexes.models import *
from datetime import datetime
from typing import Dict, List
import openai

class AzureCognitiveSearchManager:
    def __init__(self, search_endpoint, admin_key):
        self.search_endpoint = search_endpoint
        self.admin_key = admin_key
        self.index_client = SearchIndexClient(
            endpoint=search_endpoint,
            credential=AzureKeyCredential(admin_key)
        )
    def create_semantic_search_index(self, index_name="intelligent-documents"):
        """Create an index with semantic search capabilities"""
        fields = [
            SimpleField(name="id", type=SearchFieldDataType.String, key=True),
            SearchableField(name="title", type=SearchFieldDataType.String, analyzer_name="pl.microsoft"),
            SearchableField(name="content", type=SearchFieldDataType.String, analyzer_name="pl.microsoft"),
            SearchableField(name="summary", type=SearchFieldDataType.String),
            SimpleField(name="category", type=SearchFieldDataType.String, filterable=True, facetable=True),
            SimpleField(name="created_date", type=SearchFieldDataType.DateTimeOffset, sortable=True, filterable=True),
            SimpleField(name="author", type=SearchFieldDataType.String, filterable=True, facetable=True),

            # Vector field for vector search
            SearchField(
                name="contentVector",
                type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
                searchable=True,
                vector_search_dimensions=1536,  # OpenAI text-embedding-ada-002
                vector_search_profile_name="myHnswProfile"
            )
        ]

        # Vector search configuration
        vector_search = VectorSearch(
            profiles=[VectorSearchProfile(
                name="myHnswProfile",
                algorithm_configuration_name="myHnsw"
            )],
            algorithms=[HnswAlgorithmConfiguration(
                name="myHnsw",
                parameters=HnswParameters(
                    metric=VectorSearchAlgorithmMetric.COSINE,
                    m=4,
                    ef_construction=400,
                    ef_search=500
                )
            )]
        )

        # Semantic search configuration
        semantic_search = SemanticSearch(
            configurations=[SemanticConfiguration(
                name="default",
                prioritized_fields=SemanticPrioritizedFields(
                    title_field=SemanticField(field_name="title"),
                    content_fields=[
                        SemanticField(field_name="content"),
                        SemanticField(field_name="summary")
                    ]
                )
            )]
        )

        # Create the index
        index = SearchIndex(
            name=index_name,
            fields=fields,
            vector_search=vector_search,
            semantic_search=semantic_search
        )

        try:
            self.index_client.create_or_update_index(index)
            print(f"✅ Search index '{index_name}' created successfully")
            return index
        except Exception as e:
            print(f"❌ Failed to create index: {str(e)}")
            raise
    async def index_documents_with_vectors(self, documents: List[Dict], index_name: str):
        """Index documents together with their semantic vectors"""
        search_client = SearchClient(
            endpoint=self.search_endpoint,
            index_name=index_name,
            credential=AzureKeyCredential(self.admin_key)
        )

        print(f"📄 Processing {len(documents)} documents for indexing...")

        # Prepare documents with embeddings
        indexed_documents = []

        for doc in documents:
            # Generate an embedding for the content
            content_text = f"{doc.get('title', '')} {doc.get('content', '')}"

            # OpenAI embeddings (legacy openai<1.0 SDK call style, as used with Azure OpenAI)
            embedding_response = openai.Embedding.create(
                input=content_text,
                model="text-embedding-ada-002"
            )
            content_vector = embedding_response["data"][0]["embedding"]

            # Prepare the document for the index
            search_document = {
                "id": doc["id"],
                "title": doc.get("title", ""),
                "content": doc.get("content", ""),
                "summary": doc.get("summary", ""),
                "category": doc.get("category", "general"),
                "created_date": doc.get("created_date", datetime.utcnow().isoformat()),
                "author": doc.get("author", "unknown"),
                "contentVector": content_vector
            }

            indexed_documents.append(search_document)

        # Batch upload to Azure Search
        try:
            result = search_client.upload_documents(indexed_documents)

            successful = len([r for r in result if r.succeeded])
            failed = len(result) - successful

            print(f"✅ Indexed {successful} documents successfully")
            if failed > 0:
                print(f"❌ Failed to index {failed} documents")

            return {
                "total_submitted": len(documents),
                "successful": successful,
                "failed": failed,
                "index_name": index_name
            }

        except Exception as e:
            print(f"❌ Batch indexing failed: {str(e)}")
            raise
    async def semantic_search_query(self, query: str, index_name: str,
                                    search_options: Dict = None) -> Dict:
        """Execute a semantic search query"""
        search_client = SearchClient(
            endpoint=self.search_endpoint,
            index_name=index_name,
            credential=AzureKeyCredential(self.admin_key)
        )

        options = search_options or {}

        # Generate an embedding for the query
        query_embedding_response = openai.Embedding.create(
            input=query,
            model="text-embedding-ada-002"
        )
        query_vector = query_embedding_response["data"][0]["embedding"]

        # Run a hybrid search (semantic + vector)
        search_results = search_client.search(
            search_text=query,
            vector_queries=[VectorizedQuery(
                vector=query_vector,
                k_nearest_neighbors=options.get("k", 10),
                fields="contentVector"
            )],
            query_type=QueryType.SEMANTIC,
            semantic_configuration_name="default",
            query_caption=QueryCaptionType.EXTRACTIVE,
            query_answer=QueryAnswerType.EXTRACTIVE,
            filter=options.get("filter"),
            order_by=options.get("order_by"),
            top=options.get("top", 10),
            include_total_count=True
        )

        # Format the results
        formatted_results = []
        for result in search_results:
            formatted_result = {
                "document_id": result["id"],
                "title": result["title"],
                "content_preview": result["content"][:300] + "...",
                "category": result["category"],
                "author": result["author"],
                "relevance_score": result["@search.score"],
                "semantic_captions": result.get("@search.captions", []),
                "semantic_answers": result.get("@search.answers", []),
                "highlights": result.get("@search.highlights", {})
            }
            formatted_results.append(formatted_result)

        return {
            "query": query,
            "total_results": search_results.get_count() or len(formatted_results),
            "results": formatted_results,
            "search_type": "hybrid_semantic_vector"
        }
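A usage sketch of the manager above (endpoint, key, and documents are placeholders; an OpenAI / Azure OpenAI API key must already be configured for the openai module; the sample document is in Polish to match the pl.microsoft analyzer):

import asyncio

manager = AzureCognitiveSearchManager(
    search_endpoint="https://<your-service>.search.windows.net",
    admin_key="<admin-key>"
)
manager.create_semantic_search_index("intelligent-documents")

docs = [{"id": "1", "title": "Jabłka i witaminy",
         "content": "Jabłka zawierają błonnik i witaminę C."}]
asyncio.run(manager.index_documents_with_vectors(docs, "intelligent-documents"))

response = asyncio.run(manager.semantic_search_query(
    "owoce bogate w witaminy", "intelligent-documents",
    search_options={"top": 5}
))
print(response["total_results"], [r["title"] for r in response["results"]])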
✅ Practical exercises
Exercise 1: Basic Semantic Search (45 min)
- Set up an Azure Cognitive Search service
- Create a semantic search index
- Index a sample collection of documents (20+ docs)
- Test different query types
Exercise 2: Vector Search Implementation (30 min)
- Implement pure vector search with FAISS
- Compare the results against traditional keyword search
- Test with synonyms and related concepts
- Assess the result quality
Exercise 3: Hybrid Search System (30 min)
- Combine semantic + vector + keyword search
- Implement relevance boosting
- Add filters and faceted search
- Build an explanation mechanism
Exercise 4: Search Analytics (15 min)
- Implement query logging (a starting-point sketch follows this list)
- Build a dashboard with search metrics
- Add click-through tracking
- Prepare a search optimization report
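A minimal, hypothetical starting point for the query-logging part of Exercise 4 (the SearchQueryLogger class, its fields, and the log file name are illustrative, not part of the course code; it reuses the SemanticSearchEngine instance from the FAISS usage example):

import json
import time
from datetime import datetime, timezone

class SearchQueryLogger:
    """Appends one JSON line per query; a dashboard can aggregate the file later."""

    def __init__(self, log_path="search_queries.jsonl"):
        self.log_path = log_path

    def log_query(self, query: str, results_count: int, latency_ms: float, clicked_doc_id: str = None):
        entry = {
            "timestamp": datetime.now(timezone.utc).isoformat(),
            "query": query,
            "results_count": results_count,
            "latency_ms": round(latency_ms, 1),
            "clicked_doc_id": clicked_doc_id,  # to be filled in by click-through tracking
        }
        with open(self.log_path, "a", encoding="utf-8") as f:
            f.write(json.dumps(entry, ensure_ascii=False) + "\n")

# Usage: wrap a search call and record its latency
logger = SearchQueryLogger()
start = time.perf_counter()
results = engine.semantic_search("fruit vitamins")
logger.log_query("fruit vitamins", len(results), (time.perf_counter() - start) * 1000)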
📊 Evaluation metrics
Search quality metrics (a sketch for computing Precision@k and Recall@k follows the list)
- Precision@5 > 80% for the main queries
- Recall@10 > 90% for known relevant documents
- Mean Average Precision > 0.7
- Search latency < 200 ms
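A minimal sketch of how Precision@k and Recall@k can be computed, assuming you maintain a set of known relevant document IDs per test query (the IDs below are hypothetical):

from typing import List, Set

def precision_at_k(retrieved: List[str], relevant: Set[str], k: int) -> float:
    """Fraction of the top-k retrieved documents that are relevant."""
    top_k = retrieved[:k]
    if not top_k:
        return 0.0
    return sum(1 for doc_id in top_k if doc_id in relevant) / len(top_k)

def recall_at_k(retrieved: List[str], relevant: Set[str], k: int) -> float:
    """Fraction of all relevant documents that appear in the top-k results."""
    if not relevant:
        return 0.0
    top_k = retrieved[:k]
    return sum(1 for doc_id in top_k if doc_id in relevant) / len(relevant)

retrieved_ids = ["d3", "d7", "d1", "d9", "d2"]
relevant_ids = {"d1", "d2", "d4"}
print(precision_at_k(retrieved_ids, relevant_ids, 5))   # 0.4
print(recall_at_k(retrieved_ids, relevant_ids, 10))     # 0.666...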
A/B testing of different configurations
class SearchExperimentFramework:
    def __init__(self):
        self.experiments = {}
        self.baselines = {}

    def create_search_experiment(self, experiment_name: str,
                                 configurations: List[Dict]):
        """Create an A/B experiment for different search configurations"""
        self.experiments[experiment_name] = {
            "configurations": configurations,
            "results": [],
            "test_queries": [],
            "metrics": {}
        }

        print(f"🧪 Created experiment: {experiment_name}")
        return experiment_name

    async def run_search_experiment(self, experiment_name: str,
                                    test_queries: List[str]):
        """Run the search experiment"""
        experiment = self.experiments[experiment_name]
        results_by_config = {}

        # Test each configuration
        for config in experiment["configurations"]:
            config_name = config["name"]
            results_by_config[config_name] = []

            print(f"🔬 Testing configuration: {config_name}")

            for query in test_queries:
                # Run the search with this configuration
                # (_execute_search_with_config and _calculate_query_metrics are
                #  left to be implemented for the chosen search backend)
                search_results = await self._execute_search_with_config(query, config)

                results_by_config[config_name].append({
                    "query": query,
                    "results": search_results,
                    "metrics": self._calculate_query_metrics(search_results)
                })

        # Analyze the results (_analyze_experiment_results is also left to implement)
        experiment_summary = self._analyze_experiment_results(results_by_config)

        self.experiments[experiment_name]["results"] = results_by_config
        self.experiments[experiment_name]["summary"] = experiment_summary

        return experiment_summary
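The shape of the configuration dictionaries is not specified in the course material; a plausible, illustrative way of wiring up an experiment could look like this (keys such as "query_type" and "use_vector" are assumptions to be interpreted by your own _execute_search_with_config):

configs = [
    {"name": "semantic_only", "query_type": "semantic", "use_vector": False, "top": 10},
    {"name": "hybrid_vector", "query_type": "semantic", "use_vector": True, "top": 10},
]

framework = SearchExperimentFramework()
framework.create_search_experiment("semantic-vs-hybrid", configs)
# summary = await framework.run_search_experiment("semantic-vs-hybrid", ["apple nutrition", "EV batteries"])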
🏆 Session outcomes
After completing the session, participants will have:
- A working semantic search - an Azure Cognitive Search implementation
- A vector search system - a FAISS-based implementation
- Hybrid search - a combination of different techniques
- An evaluation framework - tools for measuring search quality