Neue RAG-Demo mit Beispieldateien hinzugefügt
RAG mit LanceDB
This commit is contained in:
73
RAG-Demo.py
Normal file
73
RAG-Demo.py
Normal file
@@ -0,0 +1,73 @@
|
||||
import os
|
||||
import glob
|
||||
import lancedb
|
||||
from PyPDF2 import PdfReader
|
||||
import openai
|
||||
|
||||
# Read the OpenAI API key from the OPENAI_API_KEY environment variable so the
# secret does not have to be written into this file; the placeholder below is
# only used when the variable is not set.
openai.api_key = os.environ.get('OPENAI_API_KEY', 'DEIN_OPENAI_API_KEY')
|
||||
|
||||
def extract_text_from_pdf(file_path):
    """Return the text of all pages of the PDF at ``file_path`` as one string.

    Pages whose ``extract_text()`` result is falsy (``None`` or an empty
    string) are skipped. The remaining page texts are concatenated in page
    order without any separator.
    """
    pdf = PdfReader(file_path)
    page_texts = (page.extract_text() for page in pdf.pages)
    return "".join(chunk for chunk in page_texts if chunk)
|
||||
|
||||
def get_embedding(text):
    """Return the OpenAI embedding vector for ``text``.

    The ``text-embedding-ada-002`` model is used. Both generations of the
    ``openai`` package are supported: releases from 1.0 on expose
    ``openai.embeddings.create`` and return response objects, while older
    releases expose ``openai.Embedding.create`` and return dict-like
    responses.

    Args:
        text: The string to embed.

    Returns:
        The embedding of ``text`` as returned by the API (first and only
        entry of the response's ``data`` list).
    """
    model = "text-embedding-ada-002"

    # openai>=1.0 removed ``openai.Embedding``; calling it there raises, so
    # the new module-level client has to be preferred whenever it exists.
    if hasattr(openai, "embeddings"):
        response = openai.embeddings.create(input=[text], model=model)
        return response.data[0].embedding

    # Legacy (pre-1.0) SDK path, identical to the original call.
    response = openai.Embedding.create(input=[text], model=model)
    return response['data'][0]['embedding']
|
||||
|
||||
def load_pdfs_to_lancedb(directory, db_path):
    """Embed every PDF in ``directory`` and store the result in LanceDB.

    For each ``*.pdf`` file directly inside ``directory`` the text is
    extracted and embedded, and one row with the keys ``vector``,
    ``file_link`` and ``text`` is written to the table ``pdf_embeddings`` of
    the database at ``db_path``. Rows are appended if the table already
    exists; otherwise the table is created from them.

    PDFs that yield no text are skipped. If there is nothing to store (no
    PDF files, or none with extractable text), the function returns without
    writing to the table.

    Args:
        directory: Folder that is searched (non-recursively) for PDF files.
        db_path: Location of the LanceDB database.

    Returns:
        None.
    """
    pdf_files = glob.glob(os.path.join(directory, "*.pdf"))
    if not pdf_files:
        # Nothing to index, so do not even open/create the database.
        return

    db = lancedb.connect(db_path)
    table_name = 'pdf_embeddings'

    data = []
    for pdf_file in pdf_files:
        text = extract_text_from_pdf(pdf_file)
        if not text:
            continue
        data.append({
            'vector': get_embedding(text),
            'file_link': pdf_file,
            'text': text,  # Kept alongside the vector; optional but handy.
        })

    if not data:
        # An empty batch gives LanceDB no rows to derive a schema from, so
        # creating a table (or adding) with it would fail.
        return

    if table_name in db.table_names():
        db.open_table(table_name).add(data)
    else:
        db.create_table(table_name, data=data, mode='overwrite')
|
||||
|
||||
def semantic_search(query, db_path, top_k=5):
    """Return the ``file_link`` values of the rows matching ``query``.

    The ``pdf_embeddings`` table of the LanceDB database at ``db_path`` is
    opened, ``query`` is embedded with ``get_embedding``, and a vector search
    limited to ``top_k`` rows is run against the table.

    Args:
        query: Search text to embed.
        db_path: Location of the LanceDB database.
        top_k: Maximum number of rows requested from the search.

    Returns:
        A list with the ``file_link`` column of the search result, in the
        order the search returned the rows.
    """
    # Open the table before embedding, as the original did.
    pdf_table = lancedb.connect(db_path).open_table('pdf_embeddings')
    query_vector = get_embedding(query)
    hits = pdf_table.search(query_vector).limit(top_k).to_df()
    return hits['file_link'].tolist()
|
||||
|
||||
if __name__ == "__main__":
    # Placeholder locations: point these at the real PDF folder and the
    # directory where the LanceDB database should live.
    pdf_directory = 'pfad_zum_pdf_verzeichnis'
    lancedb_path = 'pfad_zum_lancedb_verzeichnis'

    # Index the PDFs first so the search below has data to query.
    load_pdfs_to_lancedb(pdf_directory, lancedb_path)

    # Prompt for a query and run the search before printing the header, so
    # nothing is printed if the search fails.
    matches = semantic_search(input("Gib deine Suchanfrage ein: "), lancedb_path)

    print("Passende Dateien:")
    for match in matches:
        print(match)
|
||||
BIN
dateien/Demodateien/augentropfen.docx
Normal file
BIN
dateien/Demodateien/augentropfen.docx
Normal file
Binary file not shown.
BIN
dateien/Demodateien/augentropfen.pdf
Normal file
BIN
dateien/Demodateien/augentropfen.pdf
Normal file
Binary file not shown.
BIN
dateien/Demodateien/lutschtabletten.docx
Normal file
BIN
dateien/Demodateien/lutschtabletten.docx
Normal file
Binary file not shown.
BIN
dateien/Demodateien/lutschtabletten.pdf
Normal file
BIN
dateien/Demodateien/lutschtabletten.pdf
Normal file
Binary file not shown.
BIN
dateien/Demodateien/nasenspray.docx
Normal file
BIN
dateien/Demodateien/nasenspray.docx
Normal file
Binary file not shown.
BIN
dateien/Demodateien/nasenspray.pdf
Normal file
BIN
dateien/Demodateien/nasenspray.pdf
Normal file
Binary file not shown.
BIN
dateien/Demodateien/wundsalbe.docx
Normal file
BIN
dateien/Demodateien/wundsalbe.docx
Normal file
Binary file not shown.
BIN
dateien/Demodateien/wundsalbe.pdf
Normal file
BIN
dateien/Demodateien/wundsalbe.pdf
Normal file
Binary file not shown.
Reference in New Issue
Block a user