import json
from typing import List, Tuple

import streamlit as st
from langchain_community.retrievers import BM25Retriever
from langchain_core.documents import Document
from sudachipy import dictionary, tokenizer


def generate_word_ngrams(
    text: str, min_len: int, max_len: int, binary: bool = False
) -> List[Tuple[str, ...]]:
    """
    Tokenize the input text into words and generate n-grams of specified lengths.

    Args:
        text (str): The input string.
        min_len (int): The minimum length of the n-grams.
        max_len (int): The maximum length of the n-grams.
        binary (bool, optional): If True, remove duplicates. Defaults to False.

    Returns:
        List[Tuple[str, ...]]: A list of n-grams as tuples of words.
    """
    # SplitMode.A gives the shortest segmentation units SudachiPy offers.
    tokenizer_obj = dictionary.Dictionary(dict="full").create()
    mode = tokenizer.Tokenizer.SplitMode.A
    tokens = tokenizer_obj.tokenize(text, mode)
    words = [token.surface() for token in tokens]

    ngrams: List[Tuple[str, ...]] = []
    for n in range(min_len, max_len + 1):
        for k in range(len(words) - n + 1):
            ngram = tuple(words[k:k + n])
            ngrams.append(ngram)

    if binary:
        ngrams = list(set(ngrams))  # Remove duplicates

    return ngrams


def preprocess_func(text: str) -> List[str]:
    # Unigrams with duplicates removed; applied by BM25Retriever to both
    # the indexed documents and incoming queries.
    ngrams = generate_word_ngrams(text, 1, 1, True)
    return [' '.join(ngram) for ngram in ngrams]


def load_docs_from_json(json_path):
    with open(json_path) as f:
        papers = json.load(f)
    docs = []
    for paper in papers:
        page_content = f"Title: {paper['ptitle']}\n\nAbstract: {paper['abstract']}"
        doc = Document(
            page_content=page_content,
            metadata={
                'session_id': paper['session_id'],
                'session_title': paper['session_title'],
                'session_info': paper['session_info'],
                'id': paper['pid'],
                'title': paper['ptitle'],
                'pdf_link': paper['pdf_link'],
                'authors': paper['pauthors'],
            }
        )
        docs.append(doc)
    return docs


# init
json_path = "nlp2024_papers.json"
docs = load_docs_from_json(json_path)
retriever = BM25Retriever.from_documents(docs, preprocess_func=preprocess_func)
retriever.k = 10

# streamlit
st.title("NLP2024 Papers Search")
st.markdown("Search papers from [NLP2024](https://www.anlp.jp/nlp2024/).")
st.markdown(f"Number of documents: `{len(docs)}`.")
st.markdown(
    "This app uses [BM25](https://en.wikipedia.org/wiki/Okapi_BM25), "
    "allowing you to search not only with keywords like \"machine learning\" "
    "but also with documents like \"How to generate synthetic data using LLM.\""
)

prompt = st.chat_input("Search anything...")

if prompt:
    results = retriever.invoke(prompt)
    st.markdown(f"Top `{len(results)}` related papers")
    for result in results:
        with st.expander(label=result.metadata['title'], expanded=False):
            for k in result.metadata:
                st.write(f"{k}: {result.metadata[k]}")
            st.divider()
            st.markdown(result.page_content)
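
# ---------------------------------------------------------------------------
# Usage note (a minimal sketch; the filename "app.py" is an assumption, and
# the schema below is inferred only from the keys read in
# load_docs_from_json -- the real nlp2024_papers.json may contain more
# fields):
#
# Launch the app with:
#
#   streamlit run app.py
#
# nlp2024_papers.json is expected to be a JSON array of objects, each with
# at least these keys:
#
#   [
#     {
#       "session_id": "...",
#       "session_title": "...",
#       "session_info": "...",
#       "pid": "...",
#       "ptitle": "...",
#       "pdf_link": "...",
#       "pauthors": "...",
#       "abstract": "..."
#     }
#   ]
# ---------------------------------------------------------------------------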