From 9e395f533205b8900296eb536268b924bbf70d8f Mon Sep 17 00:00:00 2001
From: Martin Rattensberger
Date: Wed, 4 Dec 2024 18:34:43 +0100
Subject: [PATCH] datenbasis aus test mitr vision-Modell

---
 RAG-Demo.py | 194 +++++++++++++++++++++++++++++-----------------------
 1 file changed, 109 insertions(+), 85 deletions(-)

diff --git a/RAG-Demo.py b/RAG-Demo.py
index f2e016b..1581a49 100644
--- a/RAG-Demo.py
+++ b/RAG-Demo.py
@@ -1,99 +1,123 @@
-import os
-import glob
-import lancedb
-from PyPDF2 import PdfReader
-import openai
+"""
+File: llamaVisionApp.py
+Author: Martin Rattensberger
+Description: A GUI application for interacting with a local Llama vision model.
+             Users can upload images or PDFs and ask questions about them.
+Date: 11.11.2024 # Replace with actual date
+Version: 1.0
+Development Environment: Visual Studio Code with Continue.ai (Claude Sonnet 3.5)
+
+This script creates a tkinter-based GUI for uploading images or PDFs,
+sending them to a local Llama 3.2 vision model, and displaying the results.
+"""
+
 import tkinter as tk
-from tkinter import filedialog
-from config import OPENAI_API_KEY
+from tkinter import filedialog, scrolledtext
+import ollama
+from PIL import Image
+import fitz # PyMuPDF library for handling PDFs
+import io
+import base64
+import threading
+import time
 
-# Setze deinen OpenAI API-Schlüssel
-openai.api_key = OPENAI_API_KEY
+class LlamaVisionApp:
+    def __init__(self, master):
+        self.master = master
+        master.title("Llama Vision Interface")
 
-def extract_text_from_pdf(file_path):
-    reader = PdfReader(file_path)
-    text = ""
-    for page in reader.pages:
-        page_text = page.extract_text()
-        if page_text:
-            text += page_text
-    return text
+        # File upload button
+        self.upload_button = tk.Button(master, text="Upload File", command=self.upload_file)
+        self.upload_button.pack(pady=10)
 
-def get_embedding(text):
-    # OpenAI Embeddings API verwenden
-    response = openai.Embedding.create(
-        input=text,
-        model="text-embedding-ada-002"
-    )
-    embedding = response['data'][0]['embedding']
-    return embedding
+        # Display selected filename
+        self.filename_label = tk.Label(master, text="No file selected")
+        self.filename_label.pack()
 
-def load_pdfs_to_lancedb(directory, db_path):
-    # Verbindung zur LanceDB herstellen (synchroner Client)
-    db = lancedb.connect(db_path)
-    table_name = 'pdf_embeddings'
-    data = []
-    # Über alle PDFs im Verzeichnis iterieren
-    for pdf_file in glob.glob(os.path.join(directory, "*.pdf")):
-        text = extract_text_from_pdf(pdf_file)
-        if not text:
-            continue
-        embedding = get_embedding(text)
-        # Daten zum Einfügen vorbereiten
-        data.append({
-            'vector': embedding,
-            'file_link': pdf_file,
-            'text': text # Optional, könnte hilfreich sein
-        })
-    # Tabelle erstellen oder öffnen
-    if table_name in db.table_names():
-        table = db.open_table(table_name)
-        table.add(data)
-    else:
-        table = db.create_table(table_name, data=data)
+        # Question input - now larger
+        self.question_entry = tk.Text(master, width=50, height=3) # Changed from Entry to Text
+        self.question_entry.pack(pady=10)
+        self.question_entry.insert(tk.END, "What is in this image?")
 
-def semantic_search(query, db_path, top_k=5):
-    db = lancedb.connect(db_path)
-    table = db.open_table('pdf_embeddings')
-    query_embedding = get_embedding(query)
-    # Suche in der Tabelle durchführen
-    results = table.search(query_embedding).limit(top_k).to_df()
-    # Dateilinks aus den Ergebnissen extrahieren
-    file_links = results['file_link'].tolist()
-    return file_links
+        # Submit button
+        self.submit_button = tk.Button(master, text="Submit", command=self.submit_question)
+        self.submit_button.pack()
 
-if __name__ == "__main__":
-    import sys
-    import argparse
+        # Response display
+        self.response_text = scrolledtext.ScrolledText(master, width=60, height=30)
+        self.response_text.pack(pady=10)
 
-    # Argument Parser für Kommandozeilenargumente
-    parser = argparse.ArgumentParser(description='PDF-Semantische Suche mit LanceDB')
-    parser.add_argument('--db_dir', type=str, help='Pfad zum LanceDB-Verzeichnis', default='lancedb_data')
-    args = parser.parse_args()
+        self.file_path = None
+        self.image_data = None
+        self.processing = False
 
-    lancedb_path = args.db_dir
+    def upload_file(self):
+        self.file_path = filedialog.askopenfilename(filetypes=[("Image files", "*.png *.jpg *.jpeg *.gif"), ("PDF files", "*.pdf")])
+        if self.file_path:
+            self.filename_label.config(text=f"Selected file: {self.file_path}")
+            self.load_file()
 
-    # Tkinter GUI initialisieren
-    root = tk.Tk()
-    root.withdraw() # Hauptfenster ausblenden
+    def load_file(self):
+        if self.file_path.lower().endswith(('.png', '.jpg', '.jpeg', '.gif')):
+            with open(self.file_path, "rb") as image_file:
+                self.image_data = base64.b64encode(image_file.read()).decode('utf-8')
+        elif self.file_path.lower().endswith('.pdf'):
+            pdf_document = fitz.open(self.file_path)
+            first_page = pdf_document[0]
+            image = first_page.get_pixmap()
+            img = Image.frombytes("RGB", [image.width, image.height], image.samples)
+            buffer = io.BytesIO()
+            img.save(buffer, format="PNG")
+            self.image_data = base64.b64encode(buffer.getvalue()).decode('utf-8')
+            pdf_document.close()
 
-    # Ordnerauswahl-Dialog öffnen
-    pdf_directory = filedialog.askdirectory(title="Bitte wählen Sie das PDF-Verzeichnis aus")
+    def submit_question(self):
+        if not self.image_data:
+            self.response_text.delete('1.0', tk.END) # Clear previous response
+            self.response_text.insert(tk.END, "Please upload an image or PDF first.\n")
+            return
 
-    # Überprüfen, ob das Verzeichnis existiert
-    if not pdf_directory or not os.path.isdir(pdf_directory):
-        print("Kein gültiges Verzeichnis ausgewählt. Das Programm wird beendet.")
-        sys.exit(1)
+        question = self.question_entry.get('1.0', tk.END).strip() # Get text from Text widget
+        # Clear previous response
+        self.response_text.delete('1.0', tk.END)
+
+        # Start processing animation
+        self.processing = True
+        threading.Thread(target=self.processing_animation).start()
 
-    # PDFs in LanceDB laden
-    load_pdfs_to_lancedb(pdf_directory, lancedb_path)
+        # Run the Llama model in a separate thread
+        threading.Thread(target=self.run_llama_model, args=(question,)).start()
 
-    # Semantische Suche durchführen
-    while True:
-        query = input("Gib deine Suchanfrage ein (oder 'exit' zum Beenden): ")
-        if query.lower() == 'exit':
-            break
-        results = semantic_search(query, lancedb_path)
-        print("Passende Dateien:")
-        for file_link in results:
-            print(file_link)
+    def processing_animation(self):
+        animation = "|/-\\"
+        i = 0
+        while self.processing:
+            self.response_text.delete('1.0', tk.END)
+            self.response_text.insert(tk.END, f"Processing {animation[i % len(animation)]}")
+            self.master.update_idletasks()
+            time.sleep(0.1)
+            i += 1
+
+    def run_llama_model(self, question):
+        try:
+            response = ollama.chat(
+                model='llama3.2-vision',
+                messages=[{
+                    'role': 'user',
+                    'content': question,
+                    'images': [self.image_data]
+                }]
+            )
+            self.processing = False
+            self.master.after(0, self.update_response, question, response['message']['content'])
+        except Exception as e:
+            self.processing = False
+            self.master.after(0, self.update_response, question, f"Error: {str(e)}")
+
+    def update_response(self, question, answer):
+        self.response_text.delete('1.0', tk.END)
+        self.response_text.insert(tk.END, f"Q: {question}\nA: {answer}\n\n")
+
+root = tk.Tk()
+app = LlamaVisionApp(root)
+root.mainloop()