# "Spaces: Sleeping" — HuggingFace Space status captured during scraping; not code.
# Code adapted from https://github.com/AIAdvantage/chatgpt-api-youtube
# To create a virtual environment, see https://python.land/virtual-environments/virtualenv
# (python -m venv my-envi); for the next steps, see https://stackoverflow.com/a/74825209
# For installation errors caused by insufficient privileges, see https://stackoverflow.com/questions/66322049/could-not-install-packages-due-to-an-oserror-winerror-2-no-such-file-or-direc
# For a Python .gitignore template, see https://github.com/github/gitignore/blob/main/Python.gitignore
# If VS Code reports issues about execution policies, you may need to change the PowerShell execution-policy settings; see https://www.sharepointdiary.com/2014/03/fix-for-powershell-script-cannot-be-loaded-because-running-scripts-is-disabled-on-this-system.html
# Standard library
import os

# Third-party
import gradio as gr
from llama_index import (
    GPTSimpleVectorIndex,
    SimpleDirectoryReader,
    LLMPredictor,
    ServiceContext,
    download_loader,
    PromptHelper,
)
from llama_index.prompts.prompts import QuestionAnswerPrompt
# langchain documentation: https://github.com/hwchase17/langchain
from langchain import OpenAI  # completion models other than gpt-3.5-turbo (e.g. text-davinci-002)
from langchain.chat_models import ChatOpenAI  # available if a chat model is preferred

# Local
import my_api_keys

# llama_index / langchain read the OpenAI key from the environment.
os.environ['OPENAI_API_KEY'] = my_api_keys.my_open_ai_key
'''
Sample questions to try in the UI:
  What is this document about?
  Which countries were affected?
  How many people were injured?
  When did the earthquake take place?
  Who is the president?
  What is the date of birth of Germany? => should return no answer
'''
def custom_llama_index(question):
    """Answer *question* against the documents in ./data via a custom llama_index.

    The vector index is built on the first call and cached on the function
    object, so later queries skip the slow (and paid) loading/embedding step.

    Parameters
    ----------
    question : str
        Natural-language question to answer from the indexed documents.

    Returns
    -------
    llama_index query response
        Answer produced with a custom geography-focused QA prompt that
        replies "I don't know" when the documents contain no answer.
    """
    # Build the index only once: re-reading ./data and re-embedding it on
    # every Gradio request is slow and costs OpenAI credits each time.
    if not hasattr(custom_llama_index, "_index"):
        ## Step 1: load the data.
        # llama_index docs: https://gpt-index.readthedocs.io/en/latest/
        # data loaders: https://llamahub.ai/ (file loader: https://llamahub.ai/l/file)
        # Bind to a local name so we don't shadow the top-level
        # SimpleDirectoryReader import.
        reader_cls = download_loader("SimpleDirectoryReader")
        loader = reader_cls('./data', recursive=True, exclude_hidden=True)
        documents = loader.load_data()

        ## Step 2: build a CUSTOM llm index.
        # Adapted from https://github.com/wombyz/custom-knowledge-chatbot/tree/main/custom-knowledge-chatbot
        # Official docs: https://gpt-index.readthedocs.io/en/latest/how_to/customization/custom_llms.html
        max_input_size = 2048   # maximum prompt size sent to the model
        num_output = 256        # tokens reserved for the generated answer
        max_chunk_overlap = 20  # overlap between consecutive text chunks
        prompt_helper = PromptHelper(max_input_size, num_output, max_chunk_overlap)
        llm_predictor = LLMPredictor(
            llm=OpenAI(temperature=0.5, model_name="text-davinci-002"))
        service_context = ServiceContext.from_defaults(
            llm_predictor=llm_predictor, prompt_helper=prompt_helper)
        custom_llama_index._index = GPTSimpleVectorIndex.from_documents(
            documents, service_context=service_context)

    ## Step 3: query the cached index with a custom prompt.
    # Prompt-engineering approach from
    # https://www.linkedin.com/pulse/extending-chatgpt-knowledge-base-custom-datasources-cezar-romaniuc
    QUESTION_ANSWER_PROMPT_TMPL = (
        "You are an assistant that specializes in geographic question answering. If you don't have an answer, answer with 'I don't know' \n"
        "---------------------\n"
        "{context_str}"
        "\n---------------------\n"
        "{query_str}\n"
    )
    QUESTION_ANSWER_PROMPT = QuestionAnswerPrompt(QUESTION_ANSWER_PROMPT_TMPL)
    # NOTE(review): the original also ran a plain custom_index.query(question)
    # whose result was discarded — removed to avoid a second paid LLM call.
    return custom_llama_index._index.query(
        question, text_qa_template=QUESTION_ANSWER_PROMPT)
# Keep `demo` at module level (HuggingFace Spaces looks it up by name), but
# only launch the server when this file is executed as a script, so importing
# the module (e.g. for testing) has no side effects.
demo = gr.Interface(fn=custom_llama_index, inputs="text", outputs="text")

if __name__ == "__main__":
    demo.launch()