yutohub committed on
Commit
749fb56
1 Parent(s): 79bbe30

Create app.py

Files changed (1)
  1. app.py +97 -0
app.py ADDED
@@ -0,0 +1,97 @@
+ import datetime
+ import json
+ from typing import List, Tuple
+
+ from langchain_community.retrievers import BM25Retriever
+ from langchain_core.documents import Document
+ import streamlit as st
+ from sudachipy import dictionary, tokenizer
+
+
+ def generate_word_ngrams(
+     text: str, min_len: int, max_len: int, binary: bool = False
+ ) -> List[Tuple[str, ...]]:
+     """
+     Tokenize the input text into words and generate n-grams of specified lengths.
+
+     Args:
+         text (str): The input string.
+         min_len (int): The minimum length of the n-grams.
+         max_len (int): The maximum length of the n-grams.
+         binary (bool, optional): If True, remove duplicate n-grams. Defaults to False.
+
+     Returns:
+         List[Tuple[str, ...]]: A list of n-grams as tuples of words.
+     """
+     # SplitMode.A gives Sudachi's shortest word units; dict="full" requires
+     # the sudachidict_full package.
+     tokenizer_obj = dictionary.Dictionary(dict="full").create()
+     mode = tokenizer.Tokenizer.SplitMode.A
+     tokens = tokenizer_obj.tokenize(text, mode)
+     words = [token.surface() for token in tokens]
+
+     ngrams: List[Tuple[str, ...]] = []
+
+     # Slide a window of each length n over the word sequence.
+     for n in range(min_len, max_len + 1):
+         for k in range(len(words) - n + 1):
+             ngram = tuple(words[k:k + n])
+             ngrams.append(ngram)
+
+     if binary:
+         ngrams = list(set(ngrams))  # Remove duplicates (order is not preserved)
+
+     return ngrams
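+
+ # Illustrative sketch of the output (token boundaries depend on the installed
+ # Sudachi dictionary, so treat the segmentation below as an assumption):
+ #
+ #   generate_word_ngrams("自然言語処理", 1, 2)
+ #   # -> e.g. [('自然',), ('言語',), ('処理',),
+ #   #          ('自然', '言語'), ('言語', '処理')]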
+
+
+ def preprocess_func(text: str) -> List[str]:
+     # Tokenization hook for BM25Retriever: deduplicated unigrams, i.e. a
+     # bag of words per document and per query.
+     ngrams = generate_word_ngrams(text, 1, 1, True)
+     return [' '.join(ngram) for ngram in ngrams]
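+
+ # Hedged example of what the hook feeds to BM25 (again dictionary-dependent):
+ #
+ #   preprocess_func("合成データの生成")
+ #   # -> e.g. ['合成', 'データ', 'の', '生成'], deduplicated and unordered
+ #   #    because binary=True passes the n-grams through a set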
+
+
+ def load_docs_from_json(json_path):
+     with open(json_path) as f:
+         papers = json.load(f)
+
+     # Wrap each paper in a LangChain Document: the title and abstract form the
+     # searchable text, and everything else is kept as metadata for display.
+     docs = []
+     for paper in papers:
+         page_content = f"Title: {paper['ptitle']}\n\nAbstract: {paper['abstract']}"
+         doc = Document(
+             page_content=page_content,
+             metadata={
+                 'session_id': paper['session_id'],
+                 'session_title': paper['session_title'],
+                 'session_info': paper['session_info'],
+                 'id': paper['pid'],
+                 'title': paper['ptitle'],
+                 'pdf_link': paper['pdf_link'],
+                 'authors': paper['pauthors'],
+             }
+         )
+         docs.append(doc)
+
+     return docs
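+
+ # Assumed shape of one record in nlp2024_papers.json, inferred from the keys
+ # accessed above (values are invented placeholders):
+ #
+ #   {
+ #       "session_id": "...", "session_title": "...", "session_info": "...",
+ #       "pid": "...", "ptitle": "...", "pauthors": "...",
+ #       "pdf_link": "https://...", "abstract": "..."
+ #   }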
+
+
+ # init
+ json_path = "nlp2024_papers.json"
+ docs = load_docs_from_json(json_path)
+ retriever = BM25Retriever.from_documents(docs, preprocess_func=preprocess_func)
+ retriever.k = 10  # return the top 10 matches per query
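+
+ # Quick sanity check outside the UI (a sketch; the query string is made up):
+ #
+ #   hits = retriever.invoke("LLMを用いた合成データ生成")
+ #   for hit in hits:
+ #       print(hit.metadata["title"], hit.metadata["pdf_link"])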
+
+ # streamlit
+ st.title("NLP2024 Papers Search")
+ st.markdown("Search papers from [NLP2024](https://www.anlp.jp/nlp2024/).")
+ st.markdown(f"Number of documents: `{len(docs)}`.")
+ st.markdown("This app uses [BM25](https://en.wikipedia.org/wiki/Okapi_BM25), allowing you to search not only with keywords like \"machine learning\" but also with documents like \"How to generate synthetic data using LLM.\"")
+
+ prompt = st.chat_input("Search anything...")
+
+ if prompt:
+     results = retriever.invoke(prompt)
+
+     st.markdown(f"Top `{len(results)}` related papers")
+
+     for result in results:
+         with st.expander(label=result.metadata['title'], expanded=False):
+             for k in result.metadata:
+                 st.write(f"{k}: {result.metadata[k]}")
+             st.divider()
+             st.markdown(result.page_content)
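
To run this locally (a sketch: assumes nlp2024_papers.json sits next to app.py; BM25Retriever also needs the rank_bm25 package, and dict="full" needs sudachidict_full):

    pip install streamlit langchain-community rank_bm25 sudachipy sudachidict_full
    streamlit run app.py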