Neue RAG-Demo mit Beispieldateien hinzugefügt
RAG mit LanceDB
This commit is contained in:
73
RAG-Demo.py
Normal file
73
RAG-Demo.py
Normal file
@@ -0,0 +1,73 @@
|
|||||||
|
import os
|
||||||
|
import glob
|
||||||
|
import lancedb
|
||||||
|
from PyPDF2 import PdfReader
|
||||||
|
import openai
|
||||||
|
|
||||||
|
# Configure the OpenAI API key. It is read from the OPENAI_API_KEY environment
# variable so the secret does not have to live in the source file; if the
# variable is unset, the original placeholder string is used unchanged.
openai.api_key = os.environ.get('OPENAI_API_KEY', 'DEIN_OPENAI_API_KEY')
|
||||||
|
|
||||||
|
def extract_text_from_pdf(file_path):
    """Return the concatenated text of every page of the PDF at *file_path*.

    Pages for which ``extract_text()`` yields a falsy value (``None`` or an
    empty string) contribute nothing. The result is an empty string when no
    page yields text.
    """
    pdf = PdfReader(file_path)
    # ``or ""`` maps a falsy extraction result to an empty fragment, which is
    # equivalent to skipping that page.
    return "".join(page.extract_text() or "" for page in pdf.pages)
|
||||||
|
|
||||||
|
def get_embedding(text, model="text-embedding-ada-002"):
    """Return the embedding vector for *text* from the OpenAI Embeddings API.

    Args:
        text: The string to embed. It is sent as a single-element input list.
        model: Name of the embedding model to request. Defaults to the model
            this function previously hard-coded, so existing callers are
            unaffected.

    Returns:
        The ``embedding`` entry of the first item in the response's ``data``
        list.
    """
    # NOTE(review): ``openai.Embedding.create`` appears to be the pre-1.0
    # interface of the ``openai`` package -- confirm the installed version.
    response = openai.Embedding.create(input=[text], model=model)
    return response['data'][0]['embedding']
|
||||||
|
|
||||||
|
def load_pdfs_to_lancedb(directory, db_path):
    """Embed every PDF in *directory* and store the rows in a LanceDB table.

    For each ``*.pdf`` file directly inside *directory*, the text is extracted
    and embedded as a whole, and a row with the keys ``vector``, ``file_link``
    and ``text`` is collected. The rows are appended to the table
    ``pdf_embeddings`` if it already exists, otherwise the table is created
    from them.

    Args:
        directory: Directory that is searched (non-recursively) for PDF files.
        db_path: Location passed to ``lancedb.connect``.

    Returns:
        None. If no PDF yields any text, the database is left untouched.
    """
    db = lancedb.connect(db_path)
    table_name = 'pdf_embeddings'
    data = []

    # Sort the matches so rows are inserted in a reproducible order; glob
    # itself gives no ordering guarantee.
    for pdf_file in sorted(glob.glob(os.path.join(directory, "*.pdf"))):
        text = extract_text_from_pdf(pdf_file)
        if not text:
            # Nothing to embed for this file (no extractable text).
            continue
        data.append({
            'vector': get_embedding(text),
            'file_link': pdf_file,
            'text': text,  # Optional; kept so hits can show their source text.
        })

    if not data:
        # No rows were produced, so there is nothing to add and nothing from
        # which a new table could be created.
        return

    # Append to the existing table, or create it from the collected rows.
    if table_name in db.table_names():
        db.open_table(table_name).add(data)
    else:
        db.create_table(table_name, data=data, mode='overwrite')
|
||||||
|
|
||||||
|
def semantic_search(query, db_path, top_k=5):
    """Return the ``file_link`` values of the rows nearest to *query*.

    Opens the ``pdf_embeddings`` table in the database at *db_path*, embeds
    *query* with ``get_embedding`` and runs a vector search limited to
    *top_k* results.

    Args:
        query: Search text to embed.
        db_path: Location passed to ``lancedb.connect``.
        top_k: Maximum number of results to request from the search.

    Returns:
        A list built from the ``file_link`` column of the search result.
    """
    connection = lancedb.connect(db_path)
    embeddings_table = connection.open_table('pdf_embeddings')
    query_vector = get_embedding(query)

    # Run the vector search and materialise the hits as a DataFrame.
    search = embeddings_table.search(query_vector)
    hits = search.limit(top_k).to_df()

    return hits['file_link'].tolist()
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Placeholders: replace both with your actual paths before running.
    pdf_directory = 'pfad_zum_pdf_verzeichnis'
    lancedb_path = 'pfad_zum_lancedb_verzeichnis'

    # Step 1: index all PDFs from the directory into LanceDB.
    load_pdfs_to_lancedb(pdf_directory, lancedb_path)

    # Step 2: ask the user for a query and look up the closest documents.
    matches = semantic_search(input("Gib deine Suchanfrage ein: "), lancedb_path)

    # Step 3: list the matching files, one per line.
    print("Passende Dateien:")
    for match in matches:
        print(match)
|
||||||
BIN
dateien/Demodateien/augentropfen.docx
Normal file
BIN
dateien/Demodateien/augentropfen.docx
Normal file
Binary file not shown.
BIN
dateien/Demodateien/augentropfen.pdf
Normal file
BIN
dateien/Demodateien/augentropfen.pdf
Normal file
Binary file not shown.
BIN
dateien/Demodateien/lutschtabletten.docx
Normal file
BIN
dateien/Demodateien/lutschtabletten.docx
Normal file
Binary file not shown.
BIN
dateien/Demodateien/lutschtabletten.pdf
Normal file
BIN
dateien/Demodateien/lutschtabletten.pdf
Normal file
Binary file not shown.
BIN
dateien/Demodateien/nasenspray.docx
Normal file
BIN
dateien/Demodateien/nasenspray.docx
Normal file
Binary file not shown.
BIN
dateien/Demodateien/nasenspray.pdf
Normal file
BIN
dateien/Demodateien/nasenspray.pdf
Normal file
Binary file not shown.
BIN
dateien/Demodateien/wundsalbe.docx
Normal file
BIN
dateien/Demodateien/wundsalbe.docx
Normal file
Binary file not shown.
BIN
dateien/Demodateien/wundsalbe.pdf
Normal file
BIN
dateien/Demodateien/wundsalbe.pdf
Normal file
Binary file not shown.
Reference in New Issue
Block a user