datenbasis aus test mit vision-Modell

This commit is contained in:
2024-12-04 18:34:43 +01:00
parent f905f5649c
commit 9e395f5332

View File

@@ -1,99 +1,123 @@
import os """
import glob File: llamaVisionApp.py
import lancedb Author: Martin Rattensberger
from PyPDF2 import PdfReader Description: A GUI application for interacting with a local Llama vision model.
import openai Users can upload images or PDFs and ask questions about them.
Date: 11.11.2024 # Replace with actual date
Version: 1.0
Development Environment: Visual Studio Code with Continue.ai (Claude Sonnet 3.5)
This script creates a tkinter-based GUI for uploading images or PDFs,
sending them to a local Llama 3.2 vision model, and displaying the results.
"""
import tkinter as tk import tkinter as tk
from tkinter import filedialog from tkinter import filedialog, scrolledtext
from config import OPENAI_API_KEY import ollama
from PIL import Image
import fitz # PyMuPDF library for handling PDFs
import io
import base64
import threading
import time
# Setze deinen OpenAI API-Schlüssel class LlamaVisionApp:
openai.api_key = OPENAI_API_KEY def __init__(self, master):
self.master = master
master.title("Llama Vision Interface")
def extract_text_from_pdf(file_path): # File upload button
reader = PdfReader(file_path) self.upload_button = tk.Button(master, text="Upload File", command=self.upload_file)
text = "" self.upload_button.pack(pady=10)
for page in reader.pages:
page_text = page.extract_text()
if page_text:
text += page_text
return text
def get_embedding(text): # Display selected filename
# OpenAI Embeddings API verwenden self.filename_label = tk.Label(master, text="No file selected")
response = openai.Embedding.create( self.filename_label.pack()
input=text,
model="text-embedding-ada-002" # Question input - now larger
self.question_entry = tk.Text(master, width=50, height=3) # Changed from Entry to Text
self.question_entry.pack(pady=10)
self.question_entry.insert(tk.END, "What is in this image?")
# Submit button
self.submit_button = tk.Button(master, text="Submit", command=self.submit_question)
self.submit_button.pack()
# Response display
self.response_text = scrolledtext.ScrolledText(master, width=60, height=30)
self.response_text.pack(pady=10)
self.file_path = None
self.image_data = None
self.processing = False
def upload_file(self):
    """Ask the user for an image or PDF and load it for the next question.

    Opens a file-selection dialog filtered to PNG/JPEG/GIF images and PDFs.
    When a file is chosen, its path is stored in ``self.file_path``, shown
    in the filename label, and handed to ``load_file``. When the dialog is
    cancelled, the previous selection is left untouched.
    """
    chosen_path = filedialog.askopenfilename(
        filetypes=[
            ("Image files", "*.png *.jpg *.jpeg *.gif"),
            ("PDF files", "*.pdf"),
        ]
    )
    # A cancelled dialog yields an empty value. Do not let it overwrite the
    # path of the file whose data is still loaded and whose name is still
    # displayed in the label.
    if not chosen_path:
        return
    self.file_path = chosen_path
    self.filename_label.config(text=f"Selected file: {self.file_path}")
    self.load_file()
def load_file(self):
    """Read ``self.file_path`` into ``self.image_data`` as base64 text.

    PNG/JPEG/GIF files are base64-encoded unchanged. For a PDF, only the
    first page is rendered, saved as PNG in memory, and that PNG is
    base64-encoded. For any other extension ``self.image_data`` is left
    as ``None``.
    """
    # Drop data from an earlier selection first, so that an unsupported
    # extension or a failed load can never leave the previous file's image
    # in place while the label already names the new file.
    self.image_data = None
    path_lower = self.file_path.lower()
    if path_lower.endswith(('.png', '.jpg', '.jpeg', '.gif')):
        with open(self.file_path, "rb") as image_file:
            self.image_data = base64.b64encode(image_file.read()).decode('utf-8')
    elif path_lower.endswith('.pdf'):
        pdf_document = fitz.open(self.file_path)
        try:
            first_page = pdf_document[0]
            pixmap = first_page.get_pixmap()
            img = Image.frombytes("RGB", [pixmap.width, pixmap.height], pixmap.samples)
            buffer = io.BytesIO()
            img.save(buffer, format="PNG")
            self.image_data = base64.b64encode(buffer.getvalue()).decode('utf-8')
        finally:
            # Close the document even when rendering or encoding fails.
            pdf_document.close()
def submit_question(self):
    """Send the typed question together with the loaded image to the model.

    Does nothing while an earlier request is still being processed. If no
    image data has been loaded, a hint is written to the response box
    instead. Otherwise the response box is cleared, ``self.processing`` is
    set, and two background threads are started: one running
    ``processing_animation`` and one running ``run_llama_model`` with the
    question text.
    """
    # Ignore repeated clicks while a request is in flight; otherwise every
    # click would start another model call and another animation that all
    # share the single ``processing`` flag.
    if self.processing:
        return

    if not self.image_data:
        self.response_text.delete('1.0', tk.END)  # Clear previous response
        self.response_text.insert(tk.END, "Please upload an image or PDF first.\n")
        return

    # The question widget is a multi-line Text, so read its whole content.
    question = self.question_entry.get('1.0', tk.END).strip()

    # Clear previous response
    self.response_text.delete('1.0', tk.END)

    self.processing = True
    # Daemon threads: a pending request must not keep the interpreter alive
    # after the window has been closed.
    threading.Thread(target=self.processing_animation, daemon=True).start()
    threading.Thread(
        target=self.run_llama_model, args=(question,), daemon=True
    ).start()
def processing_animation(self):
    """Show a rotating "Processing" indicator while ``self.processing`` is set.

    The indicator is driven by ``self.master.after`` callbacks rather than
    by a sleeping loop: this method only schedules the first frame and
    returns immediately. Each frame rewrites the response box and
    schedules the next one 100 ms later, until ``self.processing`` becomes
    false.
    """
    spinner_frames = "|/-\\"

    def draw_frame(frame_index):
        # Re-checked before every redraw. Because each frame runs as an
        # ``after`` callback, a frame that fires once the flag has been
        # cleared returns here and cannot overwrite text written after
        # that point.
        if not self.processing:
            return
        self.response_text.delete('1.0', tk.END)
        self.response_text.insert(
            tk.END,
            f"Processing {spinner_frames[frame_index % len(spinner_frames)]}",
        )
        self.master.after(100, draw_frame, frame_index + 1)

    # Scheduling with delay 0 mirrors how run_llama_model hands its result
    # over via ``self.master.after(0, ...)``.
    self.master.after(0, draw_frame, 0)
def run_llama_model(self, question):
try:
response = ollama.chat(
model='llama3.2-vision',
messages=[{
'role': 'user',
'content': question,
'images': [self.image_data]
}]
) )
embedding = response['data'][0]['embedding'] self.processing = False
return embedding self.master.after(0, self.update_response, question, response['message']['content'])
except Exception as e:
self.processing = False
self.master.after(0, self.update_response, question, f"Error: {str(e)}")
def load_pdfs_to_lancedb(directory, db_path): def update_response(self, question, answer):
# Verbindung zur LanceDB herstellen (synchroner Client) self.response_text.delete('1.0', tk.END)
db = lancedb.connect(db_path) self.response_text.insert(tk.END, f"Q: {question}\nA: {answer}\n\n")
table_name = 'pdf_embeddings'
data = []
# Über alle PDFs im Verzeichnis iterieren
for pdf_file in glob.glob(os.path.join(directory, "*.pdf")):
text = extract_text_from_pdf(pdf_file)
if not text:
continue
embedding = get_embedding(text)
# Daten zum Einfügen vorbereiten
data.append({
'vector': embedding,
'file_link': pdf_file,
'text': text # Optional, könnte hilfreich sein
})
# Tabelle erstellen oder öffnen
if table_name in db.table_names():
table = db.open_table(table_name)
table.add(data)
else:
table = db.create_table(table_name, data=data)
def semantic_search(query, db_path, top_k=5): root = tk.Tk()
db = lancedb.connect(db_path) app = LlamaVisionApp(root)
table = db.open_table('pdf_embeddings') root.mainloop()
query_embedding = get_embedding(query)
# Suche in der Tabelle durchführen
results = table.search(query_embedding).limit(top_k).to_df()
# Dateilinks aus den Ergebnissen extrahieren
file_links = results['file_link'].tolist()
return file_links
if __name__ == "__main__":
    import sys
    import argparse

    # Command-line arguments: only the LanceDB directory is configurable.
    parser = argparse.ArgumentParser(description='PDF-Semantische Suche mit LanceDB')
    parser.add_argument('--db_dir', type=str, help='Pfad zum LanceDB-Verzeichnis', default='lancedb_data')
    args = parser.parse_args()
    lancedb_path = args.db_dir

    # A Tk root is required to host the folder dialog; keep it hidden.
    root = tk.Tk()
    root.withdraw()
    try:
        pdf_directory = filedialog.askdirectory(title="Bitte wählen Sie das PDF-Verzeichnis aus")
    finally:
        # The root exists only for the dialog; release it before the
        # console part of the program starts.
        root.destroy()

    # Stop when the dialog was cancelled or the chosen path is not a directory.
    if not pdf_directory or not os.path.isdir(pdf_directory):
        print("Kein gültiges Verzeichnis ausgewählt. Das Programm wird beendet.")
        sys.exit(1)

    # Load the PDFs of the chosen directory into LanceDB.
    load_pdfs_to_lancedb(pdf_directory, lancedb_path)

    # Interactive search loop; ends on 'exit' or when stdin is closed.
    while True:
        try:
            query = input("Gib deine Suchanfrage ein (oder 'exit' zum Beenden): ")
        except EOFError:
            # Closed or piped stdin: leave the loop instead of crashing.
            break
        query = query.strip()
        if query.lower() == 'exit':
            break
        if not query:
            # Nothing to search for; ask again rather than running a
            # search on an empty string.
            continue
        results = semantic_search(query, lancedb_path)
        print("Passende Dateien:")
        for file_link in results:
            print(file_link)