|
import http.client as http_client |
|
import json |
|
import logging |
|
import os |
|
import pprint |
|
import re |
|
import time |
|
import string |
|
|
|
import streamlit as st |
|
|
|
import streamlit.components.v1 as components |
|
from typing import Callable, Optional, Tuple, Union |
|
from pyserini import util |
|
from pyserini.search import LuceneSearcher, FaissSearcher, AutoQueryEncoder |
|
|
|
|
|
# Version string of this demo app.
VERSION = '1.0'

# Browser tab title and wide page layout for the whole app.
st.set_page_config(page_title="Miracl Search - Arabic", layout="wide")

# Write a Streamlit config file under the current working directory that
# selects the light base theme. The directory is created on demand and the
# file is overwritten every time the script runs.
streamlit_dir = os.path.join(os.getcwd(), ".streamlit")
os.makedirs(streamlit_dir, exist_ok=True)
with open(os.path.join(os.getcwd(), ".streamlit/config.toml"), "w") as config_file:
    config_file.write('[theme]\nbase="light"')

# Either of the two pyserini searcher flavours imported by this file.
Searcher = Union[FaissSearcher, LuceneSearcher]

# Display name of a language -> language code used for the index.
LANG_MAPPING = {'Arabic': 'ar'}
|
|
|
|
|
# Sidebar header: demo title, the MIRACL emoji line, and a one-sentence
# description of the dataset. Raw HTML is used so the text can be centred
# and styled via the `.aligncenter` class.
#
# NOTE(review): the emoji line previously read "πππ", which is what three
# 4-byte emoji look like after being decoded with a Greek single-byte
# codepage (lead byte 0xF0 -> "π", the rest dropped). It has been restored to
# the emoji used in the MIRACL project branding - confirm these are the
# intended ones.
st.sidebar.markdown(
    """
<style>
.aligncenter {
text-align: center;
font-weight: bold;
font-size: 30px;
}
</style>
<p class="aligncenter">MIRACL Arabic Demo</p>
<p class="aligncenter">🌍🙌🌏</p>
<p style="text-align: center;"> MIRACL is a multilingual dataset for ad hoc retrieval that consists of 18 different languages, collectively encompassing over three billion native speakers around the world.</p>
""",
    unsafe_allow_html=True,
)
|
|
|
# Centred "GitHub | Paper" link row shown in the sidebar under the header.
_SIDEBAR_LINKS_HTML = """
<style>
.aligncenter {
text-align: center;
}
</style>
<p style='text-align: center'>
<a href="https://github.com/project-miracl" >GitHub</a> | <a href="https://arxiv.org/abs/2210.09984" >Paper</a>
</p>
"""

# unsafe_allow_html is required because the snippet is raw HTML, not markdown.
st.sidebar.markdown(_SIDEBAR_LINKS_HTML, unsafe_allow_html=True)
|
|
|
# Free-text query box in the sidebar; starts out empty.
query = st.sidebar.text_input('Search query', value='')

# The demo is fixed to one language; this is the display name looked up in
# LANG_MAPPING when a search is run.
language = 'Arabic'

# How many documents to request from the searcher (1-1000, default 10).
max_results = st.sidebar.slider(
    label="Maximum Number of Results",
    value=10,
    min_value=1,
    max_value=1000,
    step=1,
    help="Maximum Number of Documents to return",
)
|
|
|
|
|
def _load_sparse_searcher(language: str, k1: Optional[float] = None, b: Optional[float] = None) -> Searcher:
    """Create a Lucene searcher over the MIRACL v1.0 index for ``language``.

    Args:
        language: Language code (e.g. ``'ar'``). It is interpolated into the
            index name passed to ``LuceneSearcher`` and also handed to
            ``set_language``.
        k1: Optional BM25 ``k1`` parameter.
        b: Optional BM25 ``b`` parameter.

    Returns:
        The configured ``LuceneSearcher``.
    """
    searcher = LuceneSearcher(f'lucene-index.miracl-v1.0-{language}.20221004.2b2856')
    searcher.set_language(language)

    # BM25 parameters are overridden only when BOTH are supplied; passing just
    # one of them leaves the searcher's existing settings untouched.
    if k1 is not None and b is not None:
        searcher.set_bm25(k1, b)

    return searcher
|
|
|
def search(query, language, num_results=10):
    """Run ``query`` against the sparse index for ``language`` and time it.

    Args:
        query: Query text passed to the searcher.
        language: Display name of the language; must be a key of
            ``LANG_MAPPING`` (a ``KeyError`` is raised otherwise).
        num_results: Maximum number of hits to request.

    Returns:
        A ``(results_dict, search_time)`` tuple. ``results_dict`` has the keys
        ``"docs"`` (document texts), ``"doc_ids"``, ``"score"`` (parallel
        lists, one entry per hit) and ``"lang"`` (the ``language`` argument).
        ``search_time`` is the wall-clock duration in seconds of the search
        call alone; loading the searcher is not included.
    """
    lang_code = LANG_MAPPING[language]
    searcher = _load_sparse_searcher(language=lang_code)

    started_at = time.time()
    hits = searcher.search(query, k=num_results)
    search_time = time.time() - started_at

    texts, doc_ids, scores = [], [], []
    for hit in hits:
        # Each hit's raw payload is a JSON object carrying "text" and "docid".
        record = json.loads(hit.raw)
        texts.append(record["text"])
        doc_ids.append(record["docid"])
        scores.append(hit.score)

    results_dict = {"docs": texts, "doc_ids": doc_ids, "score": scores, "lang": language}
    return results_dict, search_time
|
|
|
|
|
|
|
def highlight_string(paragraph: str, highlight_terms: list) -> str:
    """Wrap whole-word, case-insensitive occurrences of the terms in ``<b>`` tags.

    Terms are treated as literal text, not as regular expressions, and the
    matched text is kept exactly as it appears in ``paragraph`` (its casing is
    not replaced by the casing of the term).

    Args:
        paragraph: Text to annotate.
        highlight_terms: Terms to highlight. Empty strings are ignored.

    Returns:
        ``paragraph`` with every match wrapped as ``<b>match</b>``; the input
        is returned unchanged when there is nothing to highlight.
    """
    # Escape each term so regex metacharacters ("+", "(", ...) match literally
    # instead of changing the pattern or raising re.error. dict.fromkeys
    # removes duplicates while keeping a deterministic order.
    escaped_terms = list(dict.fromkeys(re.escape(term) for term in highlight_terms if term))
    if not escaped_terms:
        return paragraph

    # Try longer terms first so that, of two terms starting at the same
    # position, the longer one wins.
    escaped_terms.sort(key=len, reverse=True)

    # One combined pass: substituting term by term would let later terms match
    # inside the <b> tags inserted for earlier ones.
    pattern = re.compile(r"\b(?:" + "|".join(escaped_terms) + r")\b", flags=re.I)
    return pattern.sub(r"<b>\g<0></b>", paragraph)
|
|
|
def process_results(hits: dict, highlight_terms: list) -> str:
    """Render search hits as an HTML fragment, one ``searchresult`` div per hit.

    Args:
        hits: Mapping with the parallel lists ``'doc_ids'``, ``'score'`` and
            ``'docs'`` (one entry per hit) plus the scalar ``'lang'``.
        highlight_terms: Terms passed to ``highlight_string`` for each
            document text.

    Returns:
        The per-hit HTML snippets joined with single spaces; an empty string
        when there are no hits.
    """
    # Local import keeps this block self-contained.
    from html import escape

    language_label = escape(str(hits['lang']), quote=False)
    hit_rows = zip(hits['doc_ids'], hits['score'], hits['docs'])

    rendered_hits = []
    for rank, (doc_id, score, text) in enumerate(hit_rows, start=1):
        # Document content is interpolated into markup, so neutralise "&", "<"
        # and ">" first. Highlighting runs on the escaped text so the <b> tags
        # it adds are not themselves escaped.
        safe_text = escape(text, quote=False)
        safe_doc_id = escape(str(doc_id), quote=False)
        rendered_hits.append(f"""
        <div class='searchresult'>
        <h2>{rank}. Document ID: {safe_doc_id}</h2>
        <p>Language: <strong>{language_label}</strong>, Score: {round(score, 2)}</p>
        <p>{highlight_string(safe_text, highlight_terms)}</p>
        </div>
        <hr>
        """)
    return " ".join(rendered_hits)
|
|
|
|
|
|
|
# Search-button handler: runs the query and renders the hits in the main pane.
search_clicked = st.sidebar.button("Search")

if search_clicked and not query.strip():
    # Nothing to look up; tell the user instead of sending an empty query to
    # the searcher.
    st.warning("Please enter a search query.")
elif search_clicked:
    # search_time is returned by search() but is not displayed at present.
    hits, search_time = search(query, language, max_results)
    html_results = process_results(hits, [])

    # Report how many documents actually came back; the slider value is only
    # an upper bound on that number.
    num_hits = len(hits["doc_ids"])
    rendered_results = f"""
    <div id="searchresultsarea">
    <br>
    <p id="searchresultsnumber">About {num_hits} results</p>
    {html_results}
    </div>
    """

    # Bootstrap 5.0.2 stylesheet; the integrity hash belongs to this version.
    st.markdown(
        """
        <link href="https://cdn.jsdelivr.net/npm/bootstrap@5.0.2/dist/css/bootstrap.min.css" rel="stylesheet"
        integrity="sha384-EVSTQN3/azprG1Anm3QDgpJLIm9Nao0Yz1ztcQTwFspd3yD65VohhpuuCOmLASjC" crossorigin="anonymous">
        """,
        unsafe_allow_html=True)
    st.markdown(
        """
        <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.0.0/css/all.min.css">
        """,
        unsafe_allow_html=True)
    st.markdown(
        """
        <div class="row no-gutters mt-3 align-items-center">
        <h2> Search Results </h2>
        </div>
        """,
        unsafe_allow_html=True)

    # Styles, helper scripts and the dark-mode toggle that precede the hits
    # inside the embedded HTML component.
    results_page_head = """
    <style>
    #searchresultsarea {
        font-family: 'Arial';
    }

    #searchresultsnumber {
        font-size: 0.8rem;
        color: gray;
    }

    .searchresult h2 {
        font-size: 19px;
        line-height: 18px;
        font-weight: normal;
        color: rgb(7, 111, 222);
        margin-bottom: 0px;
        margin-top: 25px;
    }

    .searchresult a {
        font-size: 12px;
        line-height: 12px;
        color: green;
        margin-bottom: 0px;
    }

    .dark-mode {
        color: white;
    }
    </style>
    <script>
    function load_image(id){
        console.log(id)
        var x = document.getElementById(id);
        console.log(x)
        if (x.style.display === "none") {
            x.style.display = "block";
        } else {
            x.style.display = "none";
        }
    };
    function myFunction() {
        var element = document.body;
        element.classList.toggle("dark-mode");
    }
    </script>
    <button onclick="myFunction()">Toggle dark mode</button>
    """
    components.html(results_page_head + rendered_results, height=800, scrolling=True)
|
|
|
|
|
|