Create app.py
app.py
ADDED
@@ -0,0 +1,97 @@
import datetime
import json
from typing import List, Tuple

from langchain_community.retrievers import BM25Retriever
from langchain_core.documents import Document
import streamlit as st
from sudachipy import dictionary, tokenizer


def generate_word_ngrams(
    text: str, min_len: int, max_len: int, binary: bool = False
) -> List[Tuple[str, ...]]:
    """
    Tokenize the input text into words and generate n-grams of specified lengths.

    Args:
        text (str): The input string.
        min_len (int): The minimum length of the n-grams.
        max_len (int): The maximum length of the n-grams.
        binary (bool, optional): If True, remove duplicates. Defaults to False.

    Returns:
        List[Tuple[str, ...]]: A list of n-grams as tuples of words.
    """
    tokenizer_obj = dictionary.Dictionary(dict="full").create()
    mode = tokenizer.Tokenizer.SplitMode.A
    tokens = tokenizer_obj.tokenize(text, mode)
    words = [token.surface() for token in tokens]

    ngrams: List[Tuple[str, ...]] = []

    for n in range(min_len, max_len + 1):
        for k in range(len(words) - n + 1):
            ngram = tuple(words[k:k + n])
            ngrams.append(ngram)

    if binary:
        ngrams = list(set(ngrams))  # Remove duplicates

    return ngrams


def preprocess_func(text: str) -> List[str]:
    ngrams = generate_word_ngrams(text, 1, 1, True)
    return [' '.join(ngram) for ngram in ngrams]
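
# Sanity check (illustrative, exact segmentation depends on the installed
# dictionary version): with unigrams and binary=True, preprocess_func simply
# returns the deduplicated Sudachi tokens, e.g. preprocess_func("自然言語処理")
# should yield something like ["自然", "言語", "処理"] under SplitMode.A. Token
# order does not matter, since BM25 scores a bag of words. Note that
# generate_word_ngrams rebuilds the Sudachi dictionary on every call, which is
# slow when indexing many documents; hoisting the tokenizer to module level
# would be a possible optimization.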


def load_docs_from_json(json_path):
    with open(json_path) as f:
        papers = json.load(f)

    docs = []
    for paper in papers:
        page_content = f"Title: {paper['ptitle']}\n\nAbstract: {paper['abstract']}"
        doc = Document(
            page_content=page_content,
            metadata={
                'session_id': paper['session_id'],
                'session_title': paper['session_title'],
                'session_info': paper['session_info'],
                'id': paper['pid'],
                'title': paper['ptitle'],
                'pdf_link': paper['pdf_link'],
                'authors': paper['pauthors'],
            }
        )
        docs.append(doc)

    return docs
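
# Expected record shape in nlp2024_papers.json, inferred from the keys read
# above (the values shown are illustrative placeholders):
# [
#   {
#     "session_id": "...",
#     "session_title": "...",
#     "session_info": "...",
#     "pid": "...",
#     "ptitle": "...",
#     "pauthors": "...",
#     "abstract": "...",
#     "pdf_link": "https://..."
#   }
# ]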


# init
json_path = "nlp2024_papers.json"
docs = load_docs_from_json(json_path)
retriever = BM25Retriever.from_documents(docs, preprocess_func=preprocess_func)
retriever.k = 10
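# BM25Retriever applies preprocess_func both when indexing the documents above
# and when tokenizing each incoming query, so documents and queries are
# segmented consistently; retriever.k caps the number of results returned.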

# streamlit
st.title("NLP2024 Papers Search")
st.markdown("Search papers from [NLP2024](https://www.anlp.jp/nlp2024/).")
st.markdown(f"Number of documents: `{len(docs)}`.")
st.markdown("This app uses [BM25](https://en.wikipedia.org/wiki/Okapi_BM25), allowing you to search not only with keywords like \"machine learning\" but also with documents like \"How to generate synthetic data using LLM.\"")

prompt = st.chat_input("Search anything...")

if prompt:
    results = retriever.invoke(prompt)

    st.markdown(f"Top `{len(results)}` related papers")

    for result in results:
        with st.expander(label=result.metadata['title'], expanded=False):
            for k in result.metadata:
                st.write(f"{k}: {result.metadata[k]}")
            st.divider()
            st.markdown(result.page_content)
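
Running this Space also needs a requirements.txt next to app.py. A minimal sketch, inferred from the imports above rather than from this commit: streamlit, langchain-community, rank_bm25 (required by BM25Retriever), sudachipy, and sudachidict_full (required for dict="full").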