"""Index PDF files in LanceDB using OpenAI embeddings and search them semantically."""

import glob
import os
from typing import List

import lancedb
import openai
from PyPDF2 import PdfReader

# Shared by the indexing and search paths so both always agree.
EMBEDDING_MODEL = "text-embedding-ada-002"
TABLE_NAME = 'pdf_embeddings'

# Set your OpenAI API key. The OPENAI_API_KEY environment variable takes
# precedence so the secret does not have to live in the source file; the
# original placeholder is kept as the fallback.
openai.api_key = os.environ.get("OPENAI_API_KEY", 'DEIN_OPENAI_API_KEY')


def extract_text_from_pdf(file_path: str) -> str:
    """Return the concatenated text of every page of the PDF at *file_path*.

    Pages for which ``extract_text()`` returns a falsy value are skipped, so
    the result is ``""`` when no page yields any text. Page texts are joined
    without a separator.
    """
    reader = PdfReader(file_path)
    page_texts = (page.extract_text() for page in reader.pages)
    return "".join(page_text for page_text in page_texts if page_text)


def get_embedding(text: str) -> List[float]:
    """Return the embedding of *text* from the OpenAI Embeddings API.

    Errors raised by the ``openai`` client are propagated unchanged.
    """
    # This is the legacy (pre-1.0) ``openai.Embedding`` interface.
    response = openai.Embedding.create(
        input=[text],
        model=EMBEDDING_MODEL,
    )
    return response['data'][0]['embedding']


def load_pdfs_to_lancedb(directory: str, db_path: str) -> None:
    """Embed every ``*.pdf`` directly inside *directory* and store it in LanceDB.

    One record per PDF is written to the ``pdf_embeddings`` table of the
    database at *db_path*, with the keys ``vector``, ``file_link`` and
    ``text``. PDFs without extractable text are skipped. If no record was
    produced the database is left untouched.
    """
    # Connect to LanceDB.
    db = lancedb.connect(db_path)

    records = []
    # Iterate over all PDFs in the directory (non-recursive).
    for pdf_file in glob.glob(os.path.join(directory, "*.pdf")):
        text = extract_text_from_pdf(pdf_file)
        if not text:
            continue

        # NOTE(review): the whole document is embedded in a single request;
        # long PDFs may exceed the model's input limit - consider chunking.
        records.append({
            'vector': get_embedding(text),
            'file_link': pdf_file,
            'text': text,  # Optional, could be helpful
        })

    # Nothing to insert: an empty list gives LanceDB no rows to derive a
    # schema from, so skip the write instead of handing it over.
    if not records:
        return

    # Create the table or append to the existing one.
    # NOTE(review): repeated runs append the same files again; newer LanceDB
    # releases also appear to paginate table_names() - confirm this
    # membership test sees every table.
    if TABLE_NAME in db.table_names():
        db.open_table(TABLE_NAME).add(records)
    else:
        db.create_table(TABLE_NAME, data=records, mode='overwrite')


def semantic_search(query: str, db_path: str, top_k: int = 5) -> List[str]:
    """Return the ``file_link`` values of the *top_k* rows nearest to *query*.

    The query is embedded with :func:`get_embedding` and searched in the
    ``pdf_embeddings`` table of the database at *db_path*. Any error from
    opening the table (for example when it has not been created yet) is
    propagated to the caller.
    """
    db = lancedb.connect(db_path)
    table = db.open_table(TABLE_NAME)

    query_embedding = get_embedding(query)

    # Run the vector search on the table.
    results = table.search(query_embedding).limit(top_k).to_df()

    # Extract the file links from the results.
    return results['file_link'].tolist()


def main() -> None:
    """Index the configured PDF directory, then run one interactive search."""
    # Replace with your actual paths.
    pdf_directory = 'pfad_zum_pdf_verzeichnis'
    lancedb_path = 'pfad_zum_lancedb_verzeichnis'

    # Load the PDFs into LanceDB.
    load_pdfs_to_lancedb(pdf_directory, lancedb_path)

    # Run the semantic search.
    query = input("Gib deine Suchanfrage ein: ")
    results = semantic_search(query, lancedb_path)

    print("Passende Dateien:")
    for file_link in results:
        print(file_link)


if __name__ == "__main__":
    main()