Datenbasis aus Test mit Vision-Modell

This commit is contained in:
2024-12-04 18:34:43 +01:00
parent f905f5649c
commit 9e395f5332

View File

@@ -1,99 +1,123 @@
import os """
import glob File: llamaVisionApp.py
import lancedb Author: Martin Rattensberger
from PyPDF2 import PdfReader Description: A GUI application for interacting with a local Llama vision model.
import openai Users can upload images or PDFs and ask questions about them.
Date: 11.11.2024 # Replace with actual date
Version: 1.0
Development Environment: Visual Studio Code with Continue.ai (Claude Sonnet 3.5)
This script creates a tkinter-based GUI for uploading images or PDFs,
sending them to a local Llama 3.2 vision model, and displaying the results.
"""
import tkinter as tk import tkinter as tk
from tkinter import filedialog from tkinter import filedialog, scrolledtext
from config import OPENAI_API_KEY import ollama
from PIL import Image
import fitz # PyMuPDF library for handling PDFs
import io
import base64
import threading
import time
# Setze deinen OpenAI API-Schlüssel class LlamaVisionApp:
openai.api_key = OPENAI_API_KEY def __init__(self, master):
self.master = master
master.title("Llama Vision Interface")
def extract_text_from_pdf(file_path): # File upload button
reader = PdfReader(file_path) self.upload_button = tk.Button(master, text="Upload File", command=self.upload_file)
text = "" self.upload_button.pack(pady=10)
for page in reader.pages:
page_text = page.extract_text()
if page_text:
text += page_text
return text
def get_embedding(text): # Display selected filename
# OpenAI Embeddings API verwenden self.filename_label = tk.Label(master, text="No file selected")
response = openai.Embedding.create( self.filename_label.pack()
input=text,
model="text-embedding-ada-002"
)
embedding = response['data'][0]['embedding']
return embedding
def load_pdfs_to_lancedb(directory, db_path): # Question input - now larger
# Verbindung zur LanceDB herstellen (synchroner Client) self.question_entry = tk.Text(master, width=50, height=3) # Changed from Entry to Text
db = lancedb.connect(db_path) self.question_entry.pack(pady=10)
table_name = 'pdf_embeddings' self.question_entry.insert(tk.END, "What is in this image?")
data = []
# Über alle PDFs im Verzeichnis iterieren
for pdf_file in glob.glob(os.path.join(directory, "*.pdf")):
text = extract_text_from_pdf(pdf_file)
if not text:
continue
embedding = get_embedding(text)
# Daten zum Einfügen vorbereiten
data.append({
'vector': embedding,
'file_link': pdf_file,
'text': text # Optional, könnte hilfreich sein
})
# Tabelle erstellen oder öffnen
if table_name in db.table_names():
table = db.open_table(table_name)
table.add(data)
else:
table = db.create_table(table_name, data=data)
def semantic_search(query, db_path, top_k=5): # Submit button
db = lancedb.connect(db_path) self.submit_button = tk.Button(master, text="Submit", command=self.submit_question)
table = db.open_table('pdf_embeddings') self.submit_button.pack()
query_embedding = get_embedding(query)
# Suche in der Tabelle durchführen
results = table.search(query_embedding).limit(top_k).to_df()
# Dateilinks aus den Ergebnissen extrahieren
file_links = results['file_link'].tolist()
return file_links
if __name__ == "__main__": # Response display
import sys self.response_text = scrolledtext.ScrolledText(master, width=60, height=30)
import argparse self.response_text.pack(pady=10)
# Argument Parser für Kommandozeilenargumente self.file_path = None
parser = argparse.ArgumentParser(description='PDF-Semantische Suche mit LanceDB') self.image_data = None
parser.add_argument('--db_dir', type=str, help='Pfad zum LanceDB-Verzeichnis', default='lancedb_data') self.processing = False
args = parser.parse_args()
lancedb_path = args.db_dir def upload_file(self):
self.file_path = filedialog.askopenfilename(filetypes=[("Image files", "*.png *.jpg *.jpeg *.gif"), ("PDF files", "*.pdf")])
if self.file_path:
self.filename_label.config(text=f"Selected file: {self.file_path}")
self.load_file()
# Tkinter GUI initialisieren def load_file(self):
root = tk.Tk() if self.file_path.lower().endswith(('.png', '.jpg', '.jpeg', '.gif')):
root.withdraw() # Hauptfenster ausblenden with open(self.file_path, "rb") as image_file:
self.image_data = base64.b64encode(image_file.read()).decode('utf-8')
elif self.file_path.lower().endswith('.pdf'):
pdf_document = fitz.open(self.file_path)
first_page = pdf_document[0]
image = first_page.get_pixmap()
img = Image.frombytes("RGB", [image.width, image.height], image.samples)
buffer = io.BytesIO()
img.save(buffer, format="PNG")
self.image_data = base64.b64encode(buffer.getvalue()).decode('utf-8')
pdf_document.close()
# Ordnerauswahl-Dialog öffnen def submit_question(self):
pdf_directory = filedialog.askdirectory(title="Bitte wählen Sie das PDF-Verzeichnis aus") if not self.image_data:
self.response_text.delete('1.0', tk.END) # Clear previous response
self.response_text.insert(tk.END, "Please upload an image or PDF first.\n")
return
# Überprüfen, ob das Verzeichnis existiert question = self.question_entry.get('1.0', tk.END).strip() # Get text from Text widget
if not pdf_directory or not os.path.isdir(pdf_directory): # Clear previous response
print("Kein gültiges Verzeichnis ausgewählt. Das Programm wird beendet.") self.response_text.delete('1.0', tk.END)
sys.exit(1)
# PDFs in LanceDB laden # Start processing animation
load_pdfs_to_lancedb(pdf_directory, lancedb_path) self.processing = True
threading.Thread(target=self.processing_animation).start()
# Semantische Suche durchführen # Run the Llama model in a separate thread
while True: threading.Thread(target=self.run_llama_model, args=(question,)).start()
query = input("Gib deine Suchanfrage ein (oder 'exit' zum Beenden): ")
if query.lower() == 'exit': def processing_animation(self):
break animation = "|/-\\"
results = semantic_search(query, lancedb_path) i = 0
print("Passende Dateien:") while self.processing:
for file_link in results: self.response_text.delete('1.0', tk.END)
print(file_link) self.response_text.insert(tk.END, f"Processing {animation[i % len(animation)]}")
self.master.update_idletasks()
time.sleep(0.1)
i += 1
def run_llama_model(self, question):
    """Send the question and the loaded image to the vision model.

    Calls ``ollama.chat`` with the ``llama3.2-vision`` model, passing
    ``question`` as the user message and ``self.image_data`` as its only
    image. Whatever the outcome, ``self.processing`` is set to ``False``
    and ``self.update_response`` is scheduled through ``self.master.after``
    with either the model's reply text or an ``"Error: ..."`` message.

    Args:
        question: The prompt text to send along with the image.
    """
    try:
        user_message = {
            'role': 'user',
            'content': question,
            'images': [self.image_data],
        }
        reply = ollama.chat(model='llama3.2-vision', messages=[user_message])
        # Clear the flag before scheduling the result.
        self.processing = False
        answer = reply['message']['content']
        self.master.after(0, self.update_response, question, answer)
    except Exception as exc:
        # Any failure is reported in the response pane instead of being raised.
        self.processing = False
        error_text = f"Error: {str(exc)}"
        self.master.after(0, self.update_response, question, error_text)
def update_response(self, question, answer):
    """Replace the response pane's contents with one question/answer pair.

    Args:
        question: Text shown after the ``Q:`` prefix.
        answer: Text shown after the ``A:`` prefix.
    """
    output = self.response_text
    # Wipe everything from the first character to the end of the widget.
    output.delete('1.0', tk.END)
    transcript = f"Q: {question}\nA: {answer}\n\n"
    output.insert(tk.END, transcript)
# Script entry: create the Tk root window, attach the application to it,
# and run the Tk event loop.
# NOTE(review): there is no `if __name__ == "__main__":` guard visible here,
# so these statements also run when the module is imported — confirm that is
# intended.
root = tk.Tk()
app = LlamaVisionApp(root)
root.mainloop()