Sesja 10: Systemy wyszukiwania semantycznego i wektorowego

Azure Cognitive Search i nowoczesne techniki wyszukiwania

🎯 Cele sesji

  • Implementacja Azure Cognitive Search z semantic capabilities
  • Tworzenie systemów wyszukiwania wektorowego
  • Integracja wyszukiwania hybrydowego (text + vector)
  • Budowa inteligentnej bazy wiedzy

🔍 Wyszukiwanie semantyczne vs tradycyjne

Ewolucja wyszukiwania

Tradycyjne wyszukiwanie (keyword-based):

Query: "apple fruit nutrition"
Results: Dokumenty zawierające DOKŁADNIE te słowa
Problem: Nie znajduje "czerwone jabłko witaminy" (synonimy)

Wyszukiwanie semantyczne:

Query: "apple fruit nutrition"  
Results: Dokumenty o jabłkach, owocach, wartościach odżywczych
Advantage: Rozumie ZNACZENIE, nie tylko słowa

Wyszukiwanie wektorowe:

Query: [0.2, -0.8, 0.1, ...] (embedding vector)
Results: Dokumenty o podobnej semantyce
Power: Koncepcyjnie powiązane treści

Modele embedding dla semantic search

from sentence_transformers import SentenceTransformer
import numpy as np
from typing import List, Dict, Tuple
import faiss

class SemanticSearchEngine:
    """In-memory semantic search built on sentence embeddings and a FAISS HNSW index.

    Documents are dicts with a required ``"id"`` key and optional ``"title"``,
    ``"content"`` and ``"metadata"`` keys.
    """

    def __init__(self, model_name="all-MiniLM-L6-v2"):
        """Load the embedding model and set up empty storage.

        Args:
            model_name: Model identifier passed to ``SentenceTransformer``.
        """
        self.embedding_model = SentenceTransformer(model_name)
        self.document_store = {}
        self.vector_index = None
        self.document_embeddings = None
        # Row i of the vector index holds the embedding of the document whose
        # id is self._indexed_ids[i]. Kept separately from document_store so
        # the mapping stays correct after re-indexing or with duplicate ids.
        self._indexed_ids = []

        print(f"🧠 Loaded embedding model: {model_name}")
        print(f"📐 Embedding dimension: {self.embedding_model.get_sentence_embedding_dimension()}")

    def index_documents(self, documents: List[Dict]):
        """Embed the documents and (re)build the vector index from them.

        Each call replaces the vector index; ``document_store`` keeps
        accumulating documents by id.

        Args:
            documents: Non-empty list of document dicts (each needs ``"id"``).

        Returns:
            Dict with ``indexed_documents``, ``embedding_dimension`` and
            ``index_type``.

        Raises:
            ValueError: If ``documents`` is empty.
            KeyError: If a document has no ``"id"``.
        """
        if not documents:
            raise ValueError("Cannot index an empty document list.")

        print(f"📚 Indexing {len(documents)} documents...")

        texts = []
        doc_ids = []
        for doc in documents:
            doc_id = doc["id"]
            # Title and content are embedded together for a richer vector.
            texts.append(f"{doc.get('title', '')} {doc.get('content', '')}")
            doc_ids.append(doc_id)
            self.document_store[doc_id] = doc

        print("🔄 Generating embeddings...")
        embeddings = np.asarray(
            self.embedding_model.encode(texts, show_progress_bar=True),
            dtype="float32",  # FAISS only accepts float32 input
        )
        self.document_embeddings = embeddings

        print("🗂️ Building vector index...")
        dimension = embeddings.shape[1]

        # HNSW graph index (32 neighbours per node) for fast approximate search.
        self.vector_index = faiss.IndexHNSWFlat(dimension, 32)
        self.vector_index.hnsw.efConstruction = 64
        self.vector_index.hnsw.efSearch = 64
        self.vector_index.add(embeddings)

        # Only now publish the row -> id mapping that matches the new index.
        self._indexed_ids = doc_ids

        print(f"✅ Indexed {len(documents)} documents")

        return {
            "indexed_documents": len(documents),
            "embedding_dimension": dimension,
            "index_type": "HNSW"
        }

    def semantic_search(self, query: str, top_k: int = 10, similarity_threshold: float = 0.3) -> List[Dict]:
        """Return the documents closest to ``query`` in embedding space.

        Args:
            query: Free-text query.
            top_k: Maximum number of neighbours to request from the index.
            similarity_threshold: Minimum score (``1 / (1 + distance)``) a hit
                must reach to be returned.

        Returns:
            List of result dicts ordered from best to worst match.

        Raises:
            ValueError: If no documents have been indexed yet.
        """
        if self.vector_index is None:
            raise ValueError("No documents indexed. Call index_documents first.")
        if top_k <= 0:
            return []

        query_embedding = np.asarray(self.embedding_model.encode([query]), dtype="float32")
        distances, row_ids = self.vector_index.search(query_embedding, top_k)

        results = []
        for distance, row_id in zip(distances[0], row_ids[0]):
            # FAISS pads with -1 when fewer than top_k neighbours exist;
            # a negative row must not be resolved to a document.
            if row_id < 0:
                continue

            # The index returns distances (smaller is closer); map to (0, 1].
            similarity_score = 1.0 / (1.0 + float(distance))
            if similarity_score < similarity_threshold:
                continue

            doc_id = self._indexed_ids[int(row_id)]
            document = self.document_store[doc_id]

            results.append({
                "document_id": doc_id,
                "title": document.get("title", ""),
                "content_preview": document.get("content", "")[:200] + "...",
                "similarity_score": similarity_score,
                "rank": len(results) + 1,
                "metadata": document.get("metadata", {})
            })

        return results

    def hybrid_search(self, query: str, keywords: List[str] = None, top_k: int = 10) -> List[Dict]:
        """Semantic search, optionally filtered and boosted by keyword matches.

        Args:
            query: Free-text query for the semantic stage.
            keywords: If given, only documents whose title or full content
                contains at least one keyword (case-insensitive) are kept, and
                each matching keyword boosts the score by 10%.
            top_k: Maximum number of results to return.

        Returns:
            Up to ``top_k`` result dicts; keyword-filtered results also carry
            ``keyword_matches``.
        """
        # Over-fetch so the keyword filter still has enough candidates.
        candidates = self.semantic_search(query, top_k * 2)

        if keywords:
            lowered_keywords = [kw.lower() for kw in keywords]
            kept = []
            for result in candidates:
                # Match against the stored document, not the truncated preview,
                # so keywords beyond the first 200 characters still count.
                document = self.document_store[result["document_id"]]
                title = document.get("title", "").lower()
                content = document.get("content", "").lower()

                keyword_matches = sum(
                    1 for kw in lowered_keywords if kw in content or kw in title
                )
                if keyword_matches:
                    result["similarity_score"] *= (1 + keyword_matches * 0.1)  # 10% boost per match
                    result["keyword_matches"] = keyword_matches
                    kept.append(result)

            # Boosting can change the order, so sort again.
            kept.sort(key=lambda item: item["similarity_score"], reverse=True)
            candidates = kept

        return candidates[:top_k]

    def explain_search_results(self, query: str, results: List[Dict]) -> Dict:
        """Explain the top results by the sentence most similar to the query.

        Only the first five results are explained. Documents with no text
        (empty title and content) are skipped.

        Args:
            query: The query the results were produced for.
            results: Result dicts containing ``"document_id"``.

        Returns:
            Dict with ``query``, ``explanations`` and ``search_strategy``;
            each ``relevance_score`` is a cosine similarity.
        """
        query_vector = np.asarray(self.embedding_model.encode([query]), dtype="float32")[0]
        query_norm = np.linalg.norm(query_vector)

        explanations = []

        for result in results[:5]:
            doc_id = result["document_id"]
            doc = self.document_store[doc_id]
            doc_text = f"{doc.get('title', '')} {doc.get('content', '')}"

            # Naive sentence split; drop the empty fragments that a trailing
            # period or repeated periods would otherwise produce.
            sentences = [part.strip() for part in doc_text.split('.') if part.strip()]
            if not sentences:
                continue

            sentence_vectors = np.asarray(self.embedding_model.encode(sentences), dtype="float32")

            # Cosine similarity, so scores are comparable even when the model
            # does not return unit-length embeddings.
            dot_products = sentence_vectors @ query_vector
            norms = np.linalg.norm(sentence_vectors, axis=1) * query_norm
            similarities = np.divide(
                dot_products, norms, out=np.zeros_like(dot_products), where=norms > 0
            )
            best = int(np.argmax(similarities))
            best_sentence = sentences[best]

            explanations.append({
                "document_id": doc_id,
                "title": doc.get("title", ""),
                "most_relevant_sentence": best_sentence,
                "relevance_score": float(similarities[best]),
                "explanation": f"This document matched because of the sentence: '{best_sentence[:100]}...'"
            })

        return {
            "query": query,
            "explanations": explanations,
            "search_strategy": "semantic_similarity_with_sentence_analysis"
        }

🏗️ Azure Cognitive Search Implementation

Konfiguracja usługi wyszukiwania

from datetime import datetime, timezone
from typing import Dict, List

import openai
from azure.core.credentials import AzureKeyCredential
from azure.search.documents import SearchClient
from azure.search.documents.indexes import SearchIndexClient
from azure.search.documents.indexes.models import *
from azure.search.documents.models import (
    QueryAnswerType,
    QueryCaptionType,
    QueryType,
    VectorizedQuery,
)

class AzureCognitiveSearchManager:
    """Create, populate and query an Azure Cognitive Search index.

    The index combines full-text fields, a vector field and a semantic
    configuration so that hybrid (text + vector) semantic queries can be run.
    """

    # Upper bound on documents sent in a single indexing request.
    _MAX_UPLOAD_BATCH = 1000

    def __init__(self, search_endpoint, admin_key):
        """Store connection settings and create the index-management client.

        Args:
            search_endpoint: URL of the search service.
            admin_key: Admin API key used for both index and document operations.
        """
        self.search_endpoint = search_endpoint
        self.admin_key = admin_key

        self.index_client = SearchIndexClient(
            endpoint=search_endpoint,
            credential=AzureKeyCredential(admin_key)
        )

    def create_semantic_search_index(self, index_name="intelligent-documents"):
        """Create or update an index with vector and semantic search enabled.

        Args:
            index_name: Name of the index to create or update.

        Returns:
            The ``SearchIndex`` definition that was submitted.

        Raises:
            Exception: Whatever the SDK raises; it is logged and re-raised.
        """
        fields = [
            SimpleField(name="id", type=SearchFieldDataType.String, key=True),
            SearchableField(name="title", type=SearchFieldDataType.String, analyzer_name="pl.microsoft"),
            SearchableField(name="content", type=SearchFieldDataType.String, analyzer_name="pl.microsoft"),
            SearchableField(name="summary", type=SearchFieldDataType.String),
            SimpleField(name="category", type=SearchFieldDataType.String, filterable=True, facetable=True),
            SimpleField(name="created_date", type=SearchFieldDataType.DateTimeOffset, sortable=True, filterable=True),
            SimpleField(name="author", type=SearchFieldDataType.String, filterable=True, facetable=True),

            # The vector field must be a plain SearchField: the SearchableField
            # helper always builds a string field and would drop the vector
            # dimensions and profile.
            SearchField(
                name="contentVector",
                type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
                searchable=True,
                vector_search_dimensions=1536,  # OpenAI embeddings
                vector_search_profile_name="myHnswProfile"
            )
        ]

        # Vector search configuration: one HNSW algorithm exposed via one profile.
        vector_search = VectorSearch(
            profiles=[VectorSearchProfile(
                name="myHnswProfile",
                algorithm_configuration_name="myHnsw"
            )],
            algorithms=[HnswAlgorithmConfiguration(
                name="myHnsw",
                parameters=HnswParameters(
                    metric=VectorSearchAlgorithmMetric.COSINE,
                    m=4,
                    ef_construction=400,
                    ef_search=500
                )
            )]
        )

        # Semantic configuration: which fields the semantic ranker reads.
        semantic_search = SemanticSearch(
            configurations=[SemanticConfiguration(
                name="default",
                prioritized_fields=SemanticPrioritizedFields(
                    title_field=SemanticField(field_name="title"),
                    content_fields=[
                        SemanticField(field_name="content"),
                        SemanticField(field_name="summary")
                    ]
                )
            )]
        )

        index = SearchIndex(
            name=index_name,
            fields=fields,
            vector_search=vector_search,
            semantic_search=semantic_search
        )

        try:
            self.index_client.create_or_update_index(index)
            print(f"✅ Search index '{index_name}' created successfully")
            return index
        except Exception as e:
            print(f"❌ Failed to create index: {str(e)}")
            raise

    async def index_documents_with_vectors(self, documents: List[Dict], index_name: str):
        """Embed the documents and upload them to the given index.

        NOTE(review): the OpenAI and Azure calls below are synchronous, so this
        coroutine blocks the event loop while it runs.

        Args:
            documents: Document dicts; ``"id"`` is required, other fields fall
                back to defaults.
            index_name: Target index.

        Returns:
            Dict with ``total_submitted``, ``successful``, ``failed`` and
            ``index_name``.

        Raises:
            Exception: Upload errors are logged and re-raised.
        """
        search_client = SearchClient(
            endpoint=self.search_endpoint,
            index_name=index_name,
            credential=AzureKeyCredential(self.admin_key)
        )

        print(f"📄 Processing {len(documents)} documents for indexing...")

        indexed_documents = []

        for doc in documents:
            # Title and content are embedded together.
            content_text = f"{doc.get('title', '')} {doc.get('content', '')}"

            # NOTE(review): openai.Embedding is the pre-1.0 openai SDK
            # interface — confirm the installed openai version supports it.
            embedding_response = openai.Embedding.create(
                input=content_text,
                model="text-embedding-ada-002"
            )
            content_vector = embedding_response["data"][0]["embedding"]

            indexed_documents.append({
                "id": doc["id"],
                "title": doc.get("title", ""),
                "content": doc.get("content", ""),
                "summary": doc.get("summary", ""),
                "category": doc.get("category", "general"),
                # DateTimeOffset values need an explicit UTC offset, so use a
                # timezone-aware timestamp rather than a naive one.
                "created_date": doc.get("created_date", datetime.now(timezone.utc).isoformat()),
                "author": doc.get("author", "unknown"),
                "contentVector": content_vector
            })

        # Upload in bounded batches so large collections stay within the
        # per-request document limit.
        upload_results = []
        try:
            for start in range(0, len(indexed_documents), self._MAX_UPLOAD_BATCH):
                batch = indexed_documents[start:start + self._MAX_UPLOAD_BATCH]
                upload_results.extend(search_client.upload_documents(batch))
        except Exception as e:
            print(f"❌ Batch indexing failed: {str(e)}")
            raise

        successful = sum(1 for item in upload_results if item.succeeded)
        failed = len(upload_results) - successful

        print(f"✅ Indexed {successful} documents successfully")
        if failed > 0:
            print(f"❌ Failed to index {failed} documents")

        return {
            "total_submitted": len(documents),
            "successful": successful,
            "failed": failed,
            "index_name": index_name
        }

    async def semantic_search_query(self, query: str, index_name: str,
                                  search_options: Dict = None) -> Dict:
        """Run a hybrid (text + vector) query with semantic ranking.

        NOTE(review): the OpenAI and Azure calls below are synchronous, so this
        coroutine blocks the event loop while it runs.

        Args:
            query: Free-text query; also embedded for the vector part.
            index_name: Index to query.
            search_options: Optional dict with ``k``, ``filter``, ``order_by``
                and ``top``.

        Returns:
            Dict with ``query``, ``total_results``, ``results``,
            ``semantic_answers`` and ``search_type``.
        """
        search_client = SearchClient(
            endpoint=self.search_endpoint,
            index_name=index_name,
            credential=AzureKeyCredential(self.admin_key)
        )

        options = search_options or {}

        # Embed the query with the same model used at indexing time.
        query_embedding_response = openai.Embedding.create(
            input=query,
            model="text-embedding-ada-002"
        )
        query_vector = query_embedding_response["data"][0]["embedding"]

        search_results = search_client.search(
            search_text=query,
            vector_queries=[VectorizedQuery(
                vector=query_vector,
                k_nearest_neighbors=options.get("k", 10),
                fields="contentVector"
            )],
            query_type=QueryType.SEMANTIC,
            semantic_configuration_name="default",
            query_caption=QueryCaptionType.EXTRACTIVE,
            query_answer=QueryAnswerType.EXTRACTIVE,
            filter=options.get("filter"),
            order_by=options.get("order_by"),
            top=options.get("top", 10),
            include_total_count=True
        )

        formatted_results = []

        for result in search_results:
            formatted_results.append({
                "document_id": result["id"],
                "title": result["title"],
                # Content may be null in the index; avoid slicing None.
                "content_preview": (result.get("content") or "")[:300] + "...",
                "category": result["category"],
                "author": result["author"],
                "relevance_score": result["@search.score"],
                "reranker_score": result.get("@search.reranker_score"),
                "semantic_captions": result.get("@search.captions", []),
                # Kept for backward compatibility; answers belong to the whole
                # result set and are returned at the top level below.
                "semantic_answers": result.get("@search.answers", []),
                "highlights": result.get("@search.highlights", {})
            })

        return {
            "query": query,
            "total_results": search_results.get_count() or len(formatted_results),
            "results": formatted_results,
            "semantic_answers": search_results.get_answers() or [],
            "search_type": "hybrid_semantic_vector"
        }

✅ Zadania praktyczne

Zadanie 1: Basic Semantic Search (45 min)

  1. Skonfiguruj Azure Cognitive Search service
  2. Stwórz semantic search index
  3. Zaindeksuj przykładową kolekcję dokumentów (20+ docs)
  4. Przetestuj różne typy zapytań

Zadanie 2: Vector Search Implementation (30 min)

  1. Zaimplementuj pure vector search z FAISS
  2. Porównaj wyniki z traditional keyword search
  3. Przetestuj z synonimami i koncepcjami pokrewnymi
  4. Oceń jakość wyników

Zadanie 3: Hybrid Search System (30 min)

  1. Połącz semantic + vector + keyword search
  2. Zaimplementuj relevance boosting
  3. Dodaj filtry i faceted search
  4. Stwórz explanation mechanism

Zadanie 4: Search Analytics (15 min)

  1. Zaimplementuj query logging
  2. Stwórz dashboard z search metrics
  3. Dodaj click-through tracking
  4. Przygotuj search optimization report

📊 Evaluation metrics

Metryki jakości wyszukiwania

  • Precision@5 > 80% dla głównych zapytań
  • Recall@10 > 90% dla known relevant documents
  • Mean Average Precision > 0.7
  • Search latency < 200ms

A/B Testing różnych konfiguracji

class SearchExperimentFramework:
    """Registry and runner for A/B experiments over search configurations.

    NOTE(review): ``run_search_experiment`` relies on three helpers that are
    not defined in this class as shown — ``_execute_search_with_config``
    (awaitable), ``_calculate_query_metrics`` and
    ``_analyze_experiment_results``. They must be provided, e.g. by a subclass.
    """

    def __init__(self):
        # experiment name -> experiment record (see create_search_experiment)
        self.experiments = {}
        self.baselines = {}

    def create_search_experiment(self, experiment_name: str,
                               configurations: List[Dict]):
        """Register an A/B experiment for a set of search configurations.

        An existing experiment with the same name is replaced.

        Args:
            experiment_name: Key under which the experiment is stored.
            configurations: Configuration dicts; each needs a ``"name"`` key
                when the experiment is run.

        Returns:
            ``experiment_name``.
        """
        self.experiments[experiment_name] = {
            "configurations": configurations,
            "results": [],
            "test_queries": [],
            "metrics": {}
        }

        print(f"🧪 Created experiment: {experiment_name}")

        return experiment_name

    async def run_search_experiment(self, experiment_name: str,
                                  test_queries: List[str]):
        """Run every configuration of an experiment against the test queries.

        Args:
            experiment_name: Name previously passed to
                ``create_search_experiment``.
            test_queries: Queries executed for each configuration.

        Returns:
            The summary produced by ``_analyze_experiment_results``.

        Raises:
            KeyError: If the experiment has not been created.
        """
        try:
            experiment = self.experiments[experiment_name]
        except KeyError:
            raise KeyError(
                f"Unknown experiment '{experiment_name}'. "
                "Call create_search_experiment first."
            ) from None

        # Record the queries so the stored experiment is self-describing.
        experiment["test_queries"] = list(test_queries)

        results_by_config = {}

        for config in experiment["configurations"]:
            config_name = config["name"]
            config_runs = []
            results_by_config[config_name] = config_runs

            print(f"🔬 Testing configuration: {config_name}")

            for query in test_queries:
                search_results = await self._execute_search_with_config(query, config)

                config_runs.append({
                    "query": query,
                    "results": search_results,
                    "metrics": self._calculate_query_metrics(search_results)
                })

        experiment_summary = self._analyze_experiment_results(results_by_config)

        experiment["results"] = results_by_config
        experiment["summary"] = experiment_summary

        return experiment_summary

🏆 Rezultat sesji

Po ukończeniu uczestnicy będą mieli:

  1. Działający semantic search - Azure Cognitive Search implementation
  2. Vector search system - FAISS-based implementation
  3. Hybrid search - kombinacja różnych technik
  4. Evaluation framework - narzędzia do pomiaru jakości

📚 Materiały dodatkowe

💡 Wskazówka

Każda sesja to 2 godziny intensywnej nauki z praktycznymi ćwiczeniami. Materiały można przeglądać w dowolnym tempie.

📈 Postęp

Śledź swój postęp w nauce AI i przygotowaniu do certyfikacji Azure AI-102. Każdy moduł bazuje na poprzednim.