nicht funktionierende version

This commit is contained in:
2024-12-04 18:21:55 +01:00
parent d2a4cef77b
commit f905f5649c
3 changed files with 40 additions and 12 deletions

View File

@@ -3,9 +3,12 @@ import glob
import lancedb import lancedb
from PyPDF2 import PdfReader from PyPDF2 import PdfReader
import openai import openai
import tkinter as tk
from tkinter import filedialog
from config import OPENAI_API_KEY
# Setze deinen OpenAI API-Schlüssel # Setze deinen OpenAI API-Schlüssel
openai.api_key = 'DEIN_OPENAI_API_KEY' openai.api_key = OPENAI_API_KEY
def extract_text_from_pdf(file_path): def extract_text_from_pdf(file_path):
reader = PdfReader(file_path) reader = PdfReader(file_path)
@@ -19,14 +22,14 @@ def extract_text_from_pdf(file_path):
def get_embedding(text): def get_embedding(text):
# OpenAI Embeddings API verwenden # OpenAI Embeddings API verwenden
response = openai.Embedding.create( response = openai.Embedding.create(
input=[text], input=text,
model="text-embedding-ada-002" model="text-embedding-ada-002"
) )
embedding = response['data'][0]['embedding'] embedding = response['data'][0]['embedding']
return embedding return embedding
def load_pdfs_to_lancedb(directory, db_path): def load_pdfs_to_lancedb(directory, db_path):
# Verbindung zur LanceDB herstellen # Verbindung zur LanceDB herstellen (synchroner Client)
db = lancedb.connect(db_path) db = lancedb.connect(db_path)
table_name = 'pdf_embeddings' table_name = 'pdf_embeddings'
data = [] data = []
@@ -47,7 +50,7 @@ def load_pdfs_to_lancedb(directory, db_path):
table = db.open_table(table_name) table = db.open_table(table_name)
table.add(data) table.add(data)
else: else:
table = db.create_table(table_name, data=data, mode='overwrite') table = db.create_table(table_name, data=data)
def semantic_search(query, db_path, top_k=5): def semantic_search(query, db_path, top_k=5):
db = lancedb.connect(db_path) db = lancedb.connect(db_path)
@@ -60,13 +63,36 @@ def semantic_search(query, db_path, top_k=5):
return file_links return file_links
if __name__ == "__main__": if __name__ == "__main__":
# Ersetze durch deine tatsächlichen Pfade import sys
pdf_directory = 'pfad_zum_pdf_verzeichnis' import argparse
lancedb_path = 'pfad_zum_lancedb_verzeichnis'
# Argument Parser für Kommandozeilenargumente
parser = argparse.ArgumentParser(description='PDF-Semantische Suche mit LanceDB')
parser.add_argument('--db_dir', type=str, help='Pfad zum LanceDB-Verzeichnis', default='lancedb_data')
args = parser.parse_args()
lancedb_path = args.db_dir
# Tkinter GUI initialisieren
root = tk.Tk()
root.withdraw() # Hauptfenster ausblenden
# Ordnerauswahl-Dialog öffnen
pdf_directory = filedialog.askdirectory(title="Bitte wählen Sie das PDF-Verzeichnis aus")
# Überprüfen, ob das Verzeichnis existiert
if not pdf_directory or not os.path.isdir(pdf_directory):
print("Kein gültiges Verzeichnis ausgewählt. Das Programm wird beendet.")
sys.exit(1)
# PDFs in LanceDB laden # PDFs in LanceDB laden
load_pdfs_to_lancedb(pdf_directory, lancedb_path) load_pdfs_to_lancedb(pdf_directory, lancedb_path)
# Semantische Suche durchführen # Semantische Suche durchführen
query = input("Gib deine Suchanfrage ein: ") while True:
query = input("Gib deine Suchanfrage ein (oder 'exit' zum Beenden): ")
if query.lower() == 'exit':
break
results = semantic_search(query, lancedb_path) results = semantic_search(query, lancedb_path)
print("Passende Dateien:") print("Passende Dateien:")
for file_link in results: for file_link in results:

Binary file not shown.

2
config.py Normal file
View File

@@ -0,0 +1,2 @@
# OpenAI API key.
#
# The key is read from the OPENAI_API_KEY environment variable instead of being
# hard-coded, so the secret is never stored in version control. Any key that
# was ever committed to the repository must be treated as compromised and
# revoked in the OpenAI dashboard.
#
# Falls back to an empty string when the variable is unset, so importing this
# module never fails; the OpenAI client will then reject requests until a key
# is provided.
import os

OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", "")