"""FastAPI service exposing a LangChain Ollama model over HTTP.

Endpoints:
    GET  /            -- welcome message naming the configured model.
    POST /ask         -- answer a question and return it as JSON.
    POST /ask_stream  -- answer a question, streaming the text as plain text.
"""

import logging
import os
import re
from functools import lru_cache

from fastapi import FastAPI, HTTPException
from fastapi.responses import StreamingResponse
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain_community.llms import Ollama
from langchain_core.messages import HumanMessage
from pydantic import BaseModel

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

app = FastAPI()

MODEL_NAME = 'tinyllama'


@lru_cache()
def get_llm() -> Ollama:
    """Return the process-wide Ollama client for ``MODEL_NAME``.

    The function takes no arguments, so ``lru_cache`` builds the client on
    the first call and hands back that same instance on every later call.
    The client is created with a ``StreamingStdOutCallbackHandler`` attached
    through a ``CallbackManager``.
    """
    callback_manager = CallbackManager([StreamingStdOutCallbackHandler()])
    return Ollama(model=MODEL_NAME, callback_manager=callback_manager)


class Question(BaseModel):
    """Request body for the ``/ask`` and ``/ask_stream`` endpoints."""

    # The prompt text forwarded to the model.
    text: str


@app.get("/")
def read_root():
    """Return a static welcome payload that names the configured model."""
    return {"Hello": f"Welcome to {MODEL_NAME} FastAPI"}


@app.post("/ask")
async def ask_question(question: Question):
    """Send ``question.text`` to the model and return the complete answer.

    Returns:
        ``{"answer": <model output>}``.

    Raises:
        HTTPException: status 500 with the error text as ``detail`` when
            obtaining the client or generating the answer fails.
    """
    try:
        logger.info("Received question: %s", question.text)
        llm = get_llm()
        # Await the async variant: the synchronous ``invoke`` would block the
        # event loop (and every other request) for the whole generation.
        response = await llm.ainvoke(question.text)
        logger.info("Response generated successfully")
        return {"answer": response}
    except Exception as e:
        logger.exception("Error in /ask endpoint: %s", e)
        raise HTTPException(status_code=500, detail=str(e)) from e


@app.post("/ask_stream")
async def ask_question_stream(question: Question):
    """Send ``question.text`` to the model and stream the answer chunks.

    Returns:
        A ``StreamingResponse`` with media type ``text/plain`` that yields
        each chunk produced by ``llm.astream``.

    Raises:
        HTTPException: status 500 with the error text as ``detail`` when
            setting up the stream fails. Failures that occur while the body
            is being streamed are logged and re-raised from the generator,
            because the response has already been handed back by then.
    """
    try:
        logger.info("Received question for streaming: %s", question.text)
        llm = get_llm()

        async def generate():
            """Yield model chunks, then log the assembled response."""
            chunks = []
            try:
                async for chunk in llm.astream(question.text):
                    chunks.append(chunk)
                    yield chunk
            except Exception as stream_error:
                # The generator runs after this handler has returned, so the
                # outer try/except cannot see this failure; log it here.
                logger.exception(
                    "Error while streaming /ask_stream response: %s",
                    stream_error,
                )
                raise
            # Log the full response after streaming is complete
            logger.info("Full streamed response: %s", "".join(chunks))

        return StreamingResponse(generate(), media_type="text/plain")
    except Exception as e:
        logger.exception("Error in /ask_stream endpoint: %s", e)
        raise HTTPException(status_code=500, detail=str(e)) from e


@app.on_event("startup")
async def startup_event() -> None:
    """Log the configured model and build the cached client up front."""
    logger.info("Starting up with model: %s", MODEL_NAME)
    # Warm up the cache
    get_llm()


@app.on_event("shutdown")
async def shutdown_event() -> None:
    """Log that the application is shutting down."""
    logger.info("Shutting down")