|
import langid |
|
import os |
|
from haystack import Pipeline |
|
from haystack.nodes import TextConverter, PreProcessor, BM25Retriever, FARMReader |
|
from haystack.document_stores import InMemoryDocumentStore |
|
from haystack.utils import print_answers |
|
from deep_translator import GoogleTranslator |
|
|
|
class Sejarah:
    """Extractive question answering over a directory of local text files.

    On construction, the files in the documents directory are converted,
    cleaned, split into passages and written to an in-memory BM25 document
    store. A retriever -> reader querying pipeline is then built on top of
    that store and kept on ``self.querying_pipeline``.

    Questions detected as English are translated to Malay before querying,
    and the answer/context are translated back to English. Questions in any
    other language are passed to the pipeline unchanged.
    """

    def __init__(self, documents_dir="documents",
                 model_name_or_path="primasr/malaybert-for-eqa-finetuned",
                 use_gpu=True):
        """Index the documents and build the querying pipeline.

        Args:
            documents_dir: Directory whose regular files are indexed.
            model_name_or_path: Reader model handed to ``FARMReader``.
            use_gpu: Forwarded to ``FARMReader``.

        Raises:
            OSError: If ``documents_dir`` cannot be listed.
        """
        document_store = InMemoryDocumentStore(use_bm25=True)

        # Indexing: file -> text -> ~200-word passages (20-word overlap,
        # split on sentence boundaries) -> document store.
        indexing_pipeline = Pipeline()
        text_converter = TextConverter()
        preprocessor = PreProcessor(
            clean_whitespace=True,
            clean_header_footer=True,
            clean_empty_lines=True,
            split_by="word",
            split_length=200,
            split_overlap=20,
            split_respect_sentence_boundary=True,
        )
        indexing_pipeline.add_node(component=text_converter, name="TextConverter", inputs=["File"])
        indexing_pipeline.add_node(component=preprocessor, name="PreProcessor", inputs=["TextConverter"])
        indexing_pipeline.add_node(component=document_store, name="DocumentStore", inputs=["PreProcessor"])

        # Only regular files are handed to the converter; sub-directories
        # returned by os.listdir are skipped. Sorting makes the indexing
        # order independent of the file system's listing order.
        candidate_paths = (
            os.path.join(documents_dir, entry) for entry in os.listdir(documents_dir)
        )
        files_to_index = sorted(path for path in candidate_paths if os.path.isfile(path))
        indexing_pipeline.run_batch(file_paths=files_to_index)

        retriever = BM25Retriever(document_store=document_store)
        reader = FARMReader(model_name_or_path=model_name_or_path, use_gpu=use_gpu)

        # Querying: query -> BM25 retriever -> extractive reader.
        self.querying_pipeline = Pipeline()
        self.querying_pipeline.add_node(component=retriever, name="Retriever", inputs=["Query"])
        self.querying_pipeline.add_node(component=reader, name="Reader", inputs=["Retriever"])

    @staticmethod
    def _prefer_bila(question, translated):
        """Replace "apabila" with "bila" in a translated "when" question.

        The check for "when" is case-insensitive so that questions starting
        with "When ..." are covered, and a capitalised "Apabila" is rewritten
        to "Bila" as well.

        Args:
            question: The original English question.
            translated: The Malay translation of ``question``.

        Returns:
            ``translated`` with the substitution applied, or ``translated``
            unchanged when the question does not contain "when" or the
            translation is empty.
        """
        if not translated or "when" not in question.lower():
            return translated
        return translated.replace("apabila", "bila").replace("Apabila", "Bila")

    def language_converter(self, content, lang, method):
        """Translate ``content`` between English and Malay when needed.

        Args:
            content: Text to convert.
            lang: Language code detected for the user's question.
            method: ``"question"`` translates English -> Malay; any other
                value translates Malay -> English.

        Returns:
            The translated text when ``lang`` is ``"en"``; otherwise
            ``content`` unchanged. Empty or ``None`` content is returned
            as-is because there is nothing to translate.
        """
        if lang != "en" or not content or not content.strip():
            return content

        if method == "question":
            translated = GoogleTranslator(source='en', target='ms').translate(content)
            return self._prefer_bila(content, translated)

        return GoogleTranslator(source='ms', target='en').translate(content)

    def detect_language(self, content):
        """Return the language code that ``langid`` assigns to ``content``."""
        # langid.classify returns a pair; only the first element is used.
        language_code = langid.classify(content)[0]
        return language_code

    def interface(self, question, retriever_top_k=10, reader_top_k=5):
        """Answer ``question`` and return the best answer with its context.

        Args:
            question: The user's question.
            retriever_top_k: ``top_k`` passed to the "Retriever" node.
            reader_top_k: ``top_k`` passed to the "Reader" node.

        Returns:
            A ``(answer, context)`` tuple taken from the first answer of the
            pipeline result, translated to English when the question was
            detected as English. ``("", "")`` when the pipeline returns no
            answers.
        """
        language = self.detect_language(question)
        converted_question = self.language_converter(question, language, "question")

        result = self.querying_pipeline.run(
            query=converted_question,
            params={
                "Retriever": {"top_k": retriever_top_k},
                "Reader": {"top_k": reader_top_k},
            },
        )

        answers = result["answers"]
        if not answers:
            # Nothing was extracted; avoid indexing into an empty list.
            return "", ""

        best = answers[0]
        answer = self.language_converter(best.answer, language, "answer")
        context = self.language_converter(best.context, language, "answer")
        return answer, context