import os

import streamlit as st
import torch
from huggingface_hub import login
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from langchain import HuggingFacePipeline
from langchain.chains import ConversationalRetrievalChain
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import Docx2txtLoader, PyPDFLoader, TextLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.memory import ConversationBufferMemory
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Chroma

# The OpenAI API key must come from the environment (export OPENAI_API_KEY=...);
# never hard-code a real key in source control.

# Set to True to answer with a local Gemma pipeline instead of the OpenAI chat model.
USE_LOCAL_MODEL = False

# Create a directory for documents if it doesn't exist
if not os.path.exists("docs"):
    os.makedirs("docs")

# Load every PDF, Word, and plain-text file from the "docs" directory
def load_documents():
    documents = []
    for file in os.listdir("docs"):
        path = os.path.join("docs", file)
        if file.endswith(".pdf"):
            documents.extend(PyPDFLoader(path).load())
        elif file.endswith((".docx", ".doc")):
            documents.extend(Docx2txtLoader(path).load())
        elif file.endswith(".txt"):
            documents.extend(TextLoader(path).load())
    return documents

# Streamlit app: collect uploads first, so files saved on this run are indexed below
st.title('DocBot - Your Document Query Assistant')
st.write('Upload your documents to get started.')

uploaded_files = st.file_uploader("Upload Files", type=['pdf', 'docx', 'doc', 'txt'],
                                  accept_multiple_files=True)
if uploaded_files:
    st.write("Uploaded Files:")
    for file in uploaded_files:
        with open(os.path.join("docs", file.name), "wb") as f:
            f.write(file.getbuffer())
    st.write("Files uploaded successfully. You can start asking questions now.")

# Load documents; stop early if there is nothing to index yet
documents = load_documents()
if not documents:
    st.info("No documents found. Upload a file above to build the index.")
    st.stop()

# Split documents into overlapping chunks for retrieval
document_splitter = CharacterTextSplitter(separator='\n', chunk_size=500, chunk_overlap=100)
document_chunks = document_splitter.split_documents(documents)

# Embed the chunks and index them in Chroma as the vector database.
# (For production use, wrap this heavy setup in st.cache_resource so it is
# not rebuilt on every Streamlit rerun.)
embeddings = HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')
vectordb = Chroma.from_documents(document_chunks, embedding=embeddings, persist_directory='./data')
vectordb.persist()

# Choose the LLM that answers questions. Gemma is a gated model, so the local
# branch needs a Hugging Face token (set HF_TOKEN or run `huggingface-cli login`).
# Note the tokenizer must come from the same checkpoint as the generation model.
if USE_LOCAL_MODEL:
    login(token=os.environ.get("HF_TOKEN"))
    tokenizer = AutoTokenizer.from_pretrained("google/gemma-7b")
    model = AutoModelForCausalLM.from_pretrained("google/gemma-7b",
                                                 torch_dtype=torch.float16,
                                                 device_map="auto")
    pipe = pipeline("text-generation", model=model, tokenizer=tokenizer,
                    max_new_tokens=512, top_k=30)
    llm = HuggingFacePipeline(pipeline=pipe)
else:
    llm = ChatOpenAI(temperature=0.7, model_name='gpt-3.5-turbo')

# Conversational retrieval chain: fetches the 6 most relevant chunks per query
# and keeps the chat history in memory so follow-up questions have context
memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)
pdf_qa = ConversationalRetrievalChain.from_llm(llm=llm,
                                               retriever=vectordb.as_retriever(search_kwargs={'k': 6}),
                                               verbose=False,
                                               memory=memory)

# Streamlit reruns the whole script on every interaction, so no loop is needed here
query = st.text_input("Ask a question:")
if query:
    result = pdf_qa({"question": query})
    st.write("Answer: " + result["answer"])
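
# --- How to run: a minimal sketch. The filename "app.py" and the exact package
# --- list are assumptions, not from the original source; adjust to your setup.
#   pip install streamlit langchain chromadb sentence-transformers pypdf \
#       docx2txt transformers torch accelerate openai
#   export OPENAI_API_KEY=...   # your own key; ChatOpenAI reads it from the environment
#   export HF_TOKEN=...         # only needed when USE_LOCAL_MODEL is True
#   streamlit run app.py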