import streamlit as st
from streamlit_elements import elements, mui, editor, dashboard
from stqdm import stqdm
import textgrad as tg
import os
from PIL import Image
from textgrad.autograd import MultimodalLLMCall
from textgrad.loss import ImageQALoss
from io import BytesIO


class ImageQA:
    """Streamlit page that answers a question about an image and refines the answer.

    Each call to :meth:`show_results` generates a response with a multimodal
    LLM call, scores it with ``ImageQALoss``, back-propagates the textual
    feedback, applies one ``tg.TGD`` optimizer step, and appends the outcome
    to ``st.session_state.results`` so every iteration is shown in its own tab.

    ``data`` is read with the keys ``"image_URL"``, ``"question_text"`` and
    ``"evaluation_instruction"``.
    """

    # Single source of truth for the engine name used by the backward engine,
    # the loss function and the forward multimodal call.
    ENGINE_NAME = "gpt-4o"

    def __init__(self, data) -> None:
        """Store the example data, configure the engine and seed session state.

        Args:
            data: Mapping providing the default image location, question text
                and evaluation instruction (see the class docstring for keys).
        """
        self.data = data
        self.llm_engine = tg.get_engine(self.ENGINE_NAME)
        print("=" * 50, "init", "=" * 50)
        self.loss_value = ""
        self.gradients = ""
        if "iteration" not in st.session_state:
            st.session_state.iteration = 0
            st.session_state.results = []
        elif "results" not in st.session_state:
            # 'iteration' may already exist without 'results'; make sure the
            # list is there before show_results() appends to it.
            st.session_state.results = []
        tg.set_backward_engine(self.llm_engine, override=True)

    @staticmethod
    def _to_png_bytes(image):
        """Return ``image`` (a PIL image) encoded as PNG bytes.

        If the PNG encoder rejects the image (Pillow raises ``OSError`` for
        modes PNG cannot store, such as CMYK), the image is converted to RGB
        and saved again into a fresh buffer.
        """
        buffer = BytesIO()
        try:
            image.save(buffer, format="PNG")
        except OSError:
            # Start from an empty buffer so no partial output is kept.
            buffer = BytesIO()
            image.convert("RGB").save(buffer, format="PNG")
        return buffer.getvalue()

    def load_layout(self):
        """Render the input widgets and build the textgrad variables and loss.

        Left column: image uploader, falling back to ``data["image_URL"]``.
        Right column: editable question and evaluation instruction.
        Sets ``self.image_variable``, ``self.question_variable``,
        ``self.evaluation_instruction_text`` and ``self.loss_fn``.
        """
        st.markdown("**This is a solution optimization for image QA.**")
        col1, col2 = st.columns([1, 1])
        with col1:
            uploaded_file = st.file_uploader(
                "Upload an image", type=["png", "jpg", "jpeg"]
            )
            if uploaded_file is not None:
                image = Image.open(uploaded_file)
                st.image(image, caption="Uploaded Image")
            else:
                # NOTE(review): Image.open takes a path or file object, so
                # despite the key name this presumably holds a local path -
                # confirm against the data files.
                image_url = self.data["image_URL"]
                image = Image.open(image_url)
                st.image(image_url, caption="Default: MathVista image")

            self.image_variable = tg.Variable(
                self._to_png_bytes(image),
                role_description="image to answer a question about",
                requires_grad=False,
            )

        with col2:
            question_text = st.text_area(
                "Question:", self.data["question_text"], height=150
            )
            self.question_variable = tg.Variable(
                question_text,
                role_description="question",
                requires_grad=False,
            )
            self.evaluation_instruction_text = st.text_area(
                "Evaluation instruction:",
                self.data["evaluation_instruction"],
                height=100,
            )
            self.loss_fn = ImageQALoss(
                evaluation_instruction=self.evaluation_instruction_text,
                engine=self.ENGINE_NAME,
            )
            if "current_response" not in st.session_state:
                st.session_state.current_response = ""

    def _run(self):
        """Generate a response, evaluate it and apply one optimizer step.

        Requires :meth:`load_layout` to have been called first, since it reads
        ``self.image_variable``, ``self.question_variable`` and ``self.loss_fn``.
        Updates ``self.response``, ``self.loss_value``, ``self.gradients`` and
        ``st.session_state.current_response``.
        """
        # Forward pass: ask the multimodal model the question about the image.
        self.response = MultimodalLLMCall(self.ENGINE_NAME)(
            [self.image_variable, self.question_variable]
        )

        optimizer = tg.TGD(parameters=[self.response])

        loss = self.loss_fn(
            question=self.question_variable,
            image=self.image_variable,
            response=self.response,
        )
        self.loss_value = loss.value

        loss.backward()
        # Keep the gradients collected by backward() before the step runs.
        self.gradients = self.response.gradients

        optimizer.step()  # Let's update the response
        st.session_state.current_response = self.response.value

    def show_results(self):
        """Run one optimization iteration and render every iteration in tabs."""
        self._run()
        st.session_state.iteration += 1
        st.session_state.results.append({
            'iteration': st.session_state.iteration,
            'loss_value': self.loss_value,
            'response': self.response.value,
            'gradients': self.gradients,
        })

        # Drive the tabs from the stored results so the tab count can never
        # exceed the number of entries available to display.
        results = st.session_state.results
        tabs = st.tabs([f"Iteration {i + 1}" for i in range(len(results))])

        for tab, result in zip(tabs, results):
            with tab:
                st.markdown(f"Current iteration: **{result['iteration']}**")
                st.markdown("## Current solution:")
                st.markdown(result['response'])
                col1, col2 = st.columns([1, 1])
                with col1:
                    st.markdown("## Loss value")
                    st.markdown(result['loss_value'])
                with col2:
                    st.markdown("## Code gradients")
                    for gradient in result['gradients']:
                        st.markdown("### Gradient")
                        st.markdown(gradient.value)