Create app.py
app.py
ADDED
@@ -0,0 +1,97 @@
import datetime
import json
from typing import List, Tuple

from langchain_community.retrievers import BM25Retriever
from langchain_core.documents import Document
import streamlit as st
from sudachipy import dictionary, tokenizer


def generate_word_ngrams(
    text: str, min_len: int, max_len: int, binary: bool = False
) -> List[Tuple[str, ...]]:
    """
    Tokenize the input text into words and generate n-grams of specified lengths.

    Args:
        text (str): The input string.
        min_len (int): The minimum length of the n-grams.
        max_len (int): The maximum length of the n-grams.
        binary (bool, optional): If True, remove duplicates. Defaults to False.

    Returns:
        List[Tuple[str, ...]]: A list of n-grams as tuples of words.
    """
    tokenizer_obj = dictionary.Dictionary(dict="full").create()
    mode = tokenizer.Tokenizer.SplitMode.A
    tokens = tokenizer_obj.tokenize(text, mode)
    words = [token.surface() for token in tokens]

    ngrams: List[Tuple[str, ...]] = []

    for n in range(min_len, max_len + 1):
        for k in range(len(words) - n + 1):
            ngram = tuple(words[k:k + n])
            ngrams.append(ngram)

    if binary:
        ngrams = list(set(ngrams))  # Remove duplicates

    return ngrams


def preprocess_func(text: str) -> List[str]:
    ngrams = generate_word_ngrams(text, 1, 1, True)
    return [' '.join(ngram) for ngram in ngrams]
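
# Sanity check (illustrative, exact segmentation depends on the installed
# dictionary version): with unigrams and binary=True, preprocess_func simply
# returns the deduplicated Sudachi tokens, e.g. preprocess_func("自然言語処理")
# should yield something like ["自然", "言語", "処理"] under SplitMode.A. Token
# order does not matter, since BM25 scores a bag of words. Note that
# generate_word_ngrams rebuilds the Sudachi dictionary on every call, which is
# slow when indexing many documents; hoisting the tokenizer to module level
# would be a possible optimization.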


def load_docs_from_json(json_path):
    with open(json_path) as f:
        papers = json.load(f)

    docs = []
    for paper in papers:
        page_content = f"Title: {paper['ptitle']}\n\nAbstract: {paper['abstract']}"
        doc = Document(
            page_content=page_content,
            metadata={
                'session_id': paper['session_id'],
                'session_title': paper['session_title'],
                'session_info': paper['session_info'],
                'id': paper['pid'],
                'title': paper['ptitle'],
                'pdf_link': paper['pdf_link'],
                'authors': paper['pauthors'],
            }
        )
        docs.append(doc)

    return docs
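
# Expected record shape in nlp2024_papers.json, inferred from the keys read
# above (the values shown are illustrative placeholders):
# [
#   {
#     "session_id": "...",
#     "session_title": "...",
#     "session_info": "...",
#     "pid": "...",
#     "ptitle": "...",
#     "pauthors": "...",
#     "abstract": "...",
#     "pdf_link": "https://..."
#   }
# ]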


# init
json_path = "nlp2024_papers.json"
docs = load_docs_from_json(json_path)
retriever = BM25Retriever.from_documents(docs, preprocess_func=preprocess_func)
retriever.k = 10
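# BM25Retriever applies preprocess_func both when indexing the documents above
# and when tokenizing each incoming query, so documents and queries are
# segmented consistently; retriever.k caps the number of results returned.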

# streamlit
st.title("NLP2024 Papers Search")
st.markdown("Search papers from [NLP2024](https://www.anlp.jp/nlp2024/).")
st.markdown(f"Number of documents: `{len(docs)}`.")
st.markdown("This app uses [BM25](https://en.wikipedia.org/wiki/Okapi_BM25), allowing you to search not only with keywords like \"machine learning\" but also with documents like \"How to generate synthetic data using LLM.\"")

prompt = st.chat_input("Search anything...")

if prompt:
    results = retriever.invoke(prompt)

    st.markdown(f"Top `{len(results)}` related papers")

    for result in results:
        with st.expander(label=result.metadata['title'], expanded=False):
            for k in result.metadata:
                st.write(f"{k}: {result.metadata[k]}")
            st.divider()
            st.markdown(result.page_content)
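
Running this Space also needs a requirements.txt next to app.py. A minimal sketch, inferred from the imports above rather than from this commit: streamlit, langchain-community, rank_bm25 (required by BM25Retriever), sudachipy, and sudachidict_full (required for dict="full").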