Neue RAG-Demo mit Beispieldateien hinzugefügt

RAG mit LanceDB
This commit is contained in:
2024-12-04 16:19:48 +01:00
parent d8e0131e8b
commit d2a4cef77b
9 changed files with 73 additions and 0 deletions

73
RAG-Demo.py Normal file
View File

@@ -0,0 +1,73 @@
import os
import glob
import lancedb
from PyPDF2 import PdfReader
import openai
# Take the OpenAI API key from the OPENAI_API_KEY environment variable so the
# secret does not have to be written into this file. The original placeholder
# stays as the fallback, so behaviour is unchanged when the variable is unset.
openai.api_key = os.environ.get('OPENAI_API_KEY', 'DEIN_OPENAI_API_KEY')
def extract_text_from_pdf(file_path):
    """Return the concatenated text of all pages of the PDF at *file_path*.

    Pages whose ``extract_text()`` result is falsy (``None`` or ``""``)
    contribute nothing, so a PDF without extractable text yields ``""``.
    """
    pdf = PdfReader(file_path)
    # Falsy extraction results are normalised to "" so join() never sees None.
    return "".join(page.extract_text() or "" for page in pdf.pages)
def get_embedding(text):
    """Return the OpenAI embedding vector for *text*.

    Supports both generations of the ``openai`` package. Releases from 1.0 on
    removed ``openai.Embedding`` in favour of ``openai.embeddings`` and return
    response objects instead of dict-like responses, so the call is dispatched
    on which interface the installed package provides.

    Args:
        text: The string to embed; it is sent as a single-element batch.

    Returns:
        The embedding of the first (and only) input, as returned by the API.
    """
    model = "text-embedding-ada-002"

    # ``openai.OpenAI`` (the client class) only exists in openai >= 1.0. The
    # legacy name ``openai.Embedding`` is still importable there but raises on
    # use, so it cannot serve as the version probe.
    if hasattr(openai, "OpenAI"):
        response = openai.embeddings.create(input=[text], model=model)
        return response.data[0].embedding

    # openai < 1.0: legacy resource class with a dict-style response.
    response = openai.Embedding.create(input=[text], model=model)
    return response['data'][0]['embedding']
def load_pdfs_to_lancedb(directory, db_path):
    """Embed every PDF directly inside *directory* and store it in LanceDB.

    For each ``*.pdf`` file the full text is extracted and embedded as one
    vector; files without extractable text are skipped. Rows are written to
    the ``pdf_embeddings`` table, which is created on first use. When the
    table already exists the rows are appended, so loading the same directory
    twice stores every file twice.

    Args:
        directory: Directory that is searched (non-recursively) for PDFs.
        db_path: Location passed to ``lancedb.connect``.

    Returns:
        None. If no PDF produced any text, nothing is written and the
        database is not even opened.
    """
    table_name = 'pdf_embeddings'
    rows = []

    # Iterate over all PDFs in the directory.
    for pdf_file in glob.glob(os.path.join(directory, "*.pdf")):
        text = extract_text_from_pdf(pdf_file)
        if not text:
            continue
        # NOTE(review): the whole document is embedded in one request; very
        # long PDFs may exceed the embedding model's input limit - confirm.
        rows.append({
            'vector': get_embedding(text),
            'file_link': pdf_file,
            'text': text,  # optional, kept because it may be useful later
        })

    # Nothing to store: an empty batch gives create_table() no rows to derive
    # a schema from, so stop before touching the database at all.
    if not rows:
        return

    db = lancedb.connect(db_path)
    if table_name in db.table_names():
        db.open_table(table_name).add(rows)
    else:
        db.create_table(table_name, data=rows, mode='overwrite')
def semantic_search(query, db_path, top_k=5):
    """Return the file links of the stored PDFs most similar to *query*.

    Args:
        query: Free-text search query; it is embedded with ``get_embedding``.
        db_path: Location passed to ``lancedb.connect``.
        top_k: Maximum number of results to return.

    Returns:
        A list with the ``file_link`` value of each hit, in the order the
        vector search returns them. The list is empty when the
        ``pdf_embeddings`` table does not exist yet.
    """
    table_name = 'pdf_embeddings'
    db = lancedb.connect(db_path)

    # No table means nothing has been indexed. Check this before embedding
    # the query so that no API call is spent on a search that cannot succeed.
    if table_name not in db.table_names():
        return []

    table = db.open_table(table_name)
    query_embedding = get_embedding(query)
    # to_pandas() is the current name of the deprecated to_df() alias.
    hits = table.search(query_embedding).limit(top_k).to_pandas()
    return hits['file_link'].tolist()
def main():
    """Index the configured PDF directory, then run one interactive search."""
    # Replace these with your actual paths.
    source_dir = 'pfad_zum_pdf_verzeichnis'
    store_path = 'pfad_zum_lancedb_verzeichnis'

    # Load the PDFs into LanceDB.
    load_pdfs_to_lancedb(source_dir, store_path)

    # Run the semantic search for a query typed by the user.
    user_query = input("Gib deine Suchanfrage ein: ")
    matches = semantic_search(user_query, store_path)

    print("Passende Dateien:")
    for match in matches:
        print(match)


if __name__ == "__main__":
    main()

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.