"""
File: llamaVisionApp.py
Author: Martin Rattensberger
Description: A GUI application for interacting with a local Llama vision model.
             Users can upload images or PDFs and ask questions about them.
Date: 11.11.2024  # Replace with actual date
Version: 1.0
Development Environment: Visual Studio Code with Continue.ai (Claude Sonnet 3.5)

This script creates a tkinter-based GUI for uploading images or PDFs,
sending them to a local Llama 3.2 vision model, and displaying the results.
"""

import base64
import io
import queue
import threading
import time
import tkinter as tk
from tkinter import filedialog, scrolledtext

import fitz  # PyMuPDF library for handling PDFs
import ollama
from PIL import Image


class LlamaVisionApp:
    """Tkinter front end for asking a local Llama vision model about a file.

    The user picks an image (or a PDF, of which only the first page is
    rendered), types a question and presses Submit.  The model call runs in
    a worker thread; every widget access stays on the Tk main thread, and
    the worker hands its result back through a queue that the main thread
    polls while it animates the "Processing" spinner.
    """

    # Extensions whose raw file bytes are sent to the model unchanged.
    IMAGE_EXTENSIONS = ('.png', '.jpg', '.jpeg', '.gif')
    PDF_EXTENSION = '.pdf'
    MODEL_NAME = 'llama3.2-vision'
    SPINNER_FRAMES = "|/-\\"
    SPINNER_INTERVAL_MS = 100

    def __init__(self, master):
        """Build the widgets inside *master* and initialise the app state.

        Args:
            master: The Tk root (or toplevel) window that hosts the widgets.
        """
        self.master = master
        master.title("Llama Vision Interface")

        # File upload button
        self.upload_button = tk.Button(
            master, text="Upload File", command=self.upload_file
        )
        self.upload_button.pack(pady=10)

        # Display selected filename
        self.filename_label = tk.Label(master, text="No file selected")
        self.filename_label.pack()

        # Question input: a multi-line Text widget rather than an Entry
        self.question_entry = tk.Text(master, width=50, height=3)
        self.question_entry.pack(pady=10)
        self.question_entry.insert(tk.END, "What is in this image?")

        # Submit button
        self.submit_button = tk.Button(
            master, text="Submit", command=self.submit_question
        )
        self.submit_button.pack()

        # Response display
        self.response_text = scrolledtext.ScrolledText(
            master, width=60, height=30
        )
        self.response_text.pack(pady=10)

        self.file_path = None     # path of the most recently selected file
        self.image_data = None    # base64 text of the image sent to the model
        self.processing = False   # True while a model request is in flight
        # Worker thread -> main thread hand-off of (question, answer) pairs.
        self._results = queue.Queue()

    def upload_file(self):
        """Ask the user for a file and load it as the current image.

        Cancelling the dialog keeps the previous selection.  If the chosen
        file cannot be loaded, the selection is cleared and the error is
        shown in the response box.
        """
        selected = filedialog.askopenfilename(
            filetypes=[
                ("Image files", "*.png *.jpg *.jpeg *.gif"),
                ("PDF files", "*.pdf"),
            ]
        )
        if not selected:
            # Dialog cancelled: do not clobber the current file_path.
            return

        self.file_path = selected
        try:
            self.load_file()
        except (OSError, RuntimeError, ValueError, IndexError) as exc:
            # PyMuPDF's file errors derive from RuntimeError; PIL raises
            # ValueError on malformed pixel data.
            self.file_path = None
            self.filename_label.config(text="No file selected")
            self.response_text.delete('1.0', tk.END)
            self.response_text.insert(tk.END, f"Could not load file: {exc}\n")
            return

        self.filename_label.config(text=f"Selected file: {self.file_path}")

    def load_file(self):
        """Read ``self.file_path`` into ``self.image_data`` as base64 text.

        Image files are encoded as-is.  For a PDF, the first page is
        rendered to a PNG and that PNG is encoded.

        Raises:
            ValueError: If the extension is unsupported or the PDF has no
                pages.
            OSError: If the file cannot be read.
        """
        # Drop the previous image first, so a failed load can never leave
        # the old file's data to be sent with the next question.
        self.image_data = None
        path_lower = self.file_path.lower()

        if path_lower.endswith(self.IMAGE_EXTENSIONS):
            with open(self.file_path, "rb") as image_file:
                raw_bytes = image_file.read()
        elif path_lower.endswith(self.PDF_EXTENSION):
            raw_bytes = self._render_first_pdf_page(self.file_path)
        else:
            raise ValueError(f"Unsupported file type: {self.file_path}")

        self.image_data = base64.b64encode(raw_bytes).decode('utf-8')

    @staticmethod
    def _render_first_pdf_page(pdf_path):
        """Return the first page of the PDF at *pdf_path* as PNG bytes."""
        pdf_document = fitz.open(pdf_path)
        try:
            if len(pdf_document) == 0:
                raise ValueError("The PDF contains no pages.")
            # alpha=False keeps the samples 3 bytes per pixel, matching the
            # "RGB" mode handed to PIL below.
            pixmap = pdf_document[0].get_pixmap(alpha=False)
            page_image = Image.frombytes(
                "RGB", [pixmap.width, pixmap.height], pixmap.samples
            )
        finally:
            # Close the document even when rendering fails.
            pdf_document.close()

        buffer = io.BytesIO()
        page_image.save(buffer, format="PNG")
        return buffer.getvalue()

    def submit_question(self):
        """Send the typed question and the loaded image to the model.

        Does nothing while a previous request is still running.  Shows a
        hint instead of calling the model when no file is loaded or the
        question is empty.
        """
        if self.processing:
            # One request at a time; the button is disabled as well, this
            # guards programmatic calls.
            return

        # Clear previous response
        self.response_text.delete('1.0', tk.END)

        if not self.image_data:
            self.response_text.insert(
                tk.END, "Please upload an image or PDF first.\n"
            )
            return

        question = self.question_entry.get('1.0', tk.END).strip()
        if not question:
            self.response_text.insert(tk.END, "Please enter a question.\n")
            return

        self.processing = True
        self.submit_button.config(state=tk.DISABLED)

        # Spinner and result polling run on the main thread via after().
        self.processing_animation()

        # Run the model in a worker thread.  The image is snapshotted so a
        # file uploaded meanwhile does not change this request; daemon=True
        # lets the process exit if the window is closed mid-request.
        threading.Thread(
            target=self.run_llama_model,
            args=(question, self.image_data),
            daemon=True,
        ).start()

    def processing_animation(self, frame=0):
        """Draw one spinner frame, or show the answer once it has arrived.

        Runs on the Tk main thread and re-schedules itself with ``after``
        until the worker thread has queued a result.

        Args:
            frame: Index of the spinner frame to draw; wraps around.
        """
        if not self.processing:
            return

        try:
            question, answer = self._results.get_nowait()
        except queue.Empty:
            spinner = self.SPINNER_FRAMES[frame % len(self.SPINNER_FRAMES)]
            self.response_text.delete('1.0', tk.END)
            self.response_text.insert(tk.END, f"Processing {spinner}")
            self.master.after(
                self.SPINNER_INTERVAL_MS, self.processing_animation, frame + 1
            )
            return

        # Stop the spinner before showing the answer; both happen on this
        # thread, so no later frame can overwrite the response.
        self.processing = False
        self.submit_button.config(state=tk.NORMAL)
        self.update_response(question, answer)

    def run_llama_model(self, question, image_data=None):
        """Query the model and queue ``(question, answer)`` for the GUI.

        Intended to run in a worker thread; it touches no widgets.

        Args:
            question: The user's question text.
            image_data: Base64 image to send; defaults to the currently
                loaded ``self.image_data``.
        """
        if image_data is None:
            image_data = self.image_data

        try:
            response = ollama.chat(
                model=self.MODEL_NAME,
                messages=[{
                    'role': 'user',
                    'content': question,
                    'images': [image_data],
                }],
            )
            answer = response['message']['content']
        except Exception as e:
            # Thread boundary: report any failure to the user instead of
            # letting the worker die silently.
            answer = f"Error: {str(e)}"

        self._results.put((question, answer))

    def update_response(self, question, answer):
        """Replace the response box content with the question and answer."""
        self.response_text.delete('1.0', tk.END)
        self.response_text.insert(tk.END, f"Q: {question}\nA: {answer}\n\n")


root = tk.Tk()
app = LlamaVisionApp(root)
root.mainloop()