"""Gradio demo: translate Icelandic text and explain its weakest aspect.

The script loads four sequence-classification models (sentiment, formality,
toxicity, politeness), wraps each in a ferret ``Benchmark`` for token-level
explanations, and exposes ``analyze_text`` through a Gradio interface.
"""

import pkg_resources
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
import gradio as gr
from ferret import Benchmark

# Load models and tokenizers
sentiment_tokenizer = AutoTokenizer.from_pretrained("Birkir/electra-base-igc-is-sentiment-analysis")
sentiment_model = AutoModelForSequenceClassification.from_pretrained("Birkir/electra-base-igc-is-sentiment-analysis")

formality_tokenizer = AutoTokenizer.from_pretrained("svanhvit/formality-classification-icebert")
formality_model = AutoModelForSequenceClassification.from_pretrained("svanhvit/formality-classification-icebert")

toxicity_tokenizer = AutoTokenizer.from_pretrained("unitary/toxic-bert")
toxicity_model = AutoModelForSequenceClassification.from_pretrained("unitary/toxic-bert")

politeness_tokenizer = AutoTokenizer.from_pretrained("Genius1237/xlm-roberta-large-tydip")
politeness_model = AutoModelForSequenceClassification.from_pretrained("Genius1237/xlm-roberta-large-tydip")

# Initialize benchmarks
sentiment_bench = Benchmark(sentiment_model, sentiment_tokenizer)
formality_bench = Benchmark(formality_model, formality_tokenizer)
toxicity_bench = Benchmark(toxicity_model, toxicity_tokenizer)
politeness_bench = Benchmark(politeness_model, politeness_tokenizer)

# Initialize pipelines for translation and text classification
translator = pipeline("translation", model="Helsinki-NLP/opus-mt-is-en")
sentiment_classifier = pipeline("text-classification", model="Birkir/electra-base-igc-is-sentiment-analysis")
formality_classifier = pipeline("text-classification", model="svanhvit/formality-classification-icebert")
detoxify_classifier = pipeline(
    'text-classification',
    model='unitary/toxic-bert',
    tokenizer='bert-base-uncased',
    function_to_apply='sigmoid',
    top_k=None,
)
politeness_classifier = pipeline("text-classification", model="Genius1237/xlm-roberta-large-tydip")


def _build_byte_level_tables():
    """Build the GPT-2 style byte <-> printable-character tables.

    Byte-level BPE tokenizers show every raw byte as one printable character:
    bytes that are already printable map to themselves, the remaining ones are
    shifted to code points 256 and up (so the space byte 0x20 becomes U+0120).

    Returns:
        A ``(byte_to_char, char_to_byte)`` pair of dictionaries.
    """
    printable = [*range(0x21, 0x7F), *range(0xA1, 0xAD), *range(0xAE, 0x100)]
    byte_to_char = {b: chr(b) for b in printable}
    shifted = 0
    for b in range(256):
        if b not in byte_to_char:
            byte_to_char[b] = chr(256 + shifted)
            shifted += 1
    char_to_byte = {c: b for b, c in byte_to_char.items()}
    return byte_to_char, char_to_byte


_BYTE_TO_CHAR, _CHAR_TO_BYTE = _build_byte_level_tables()

# Byte-level spelling -> real letter for the Icelandic alphabet. Generated
# from the table instead of typed by hand, so the keys cannot be corrupted by
# an encoding round-trip of this source file. Only used as a fallback when a
# token cannot be decoded as a whole.
_ICELANDIC_LETTERS = "áéíóúýþæöðÁÉÍÓÚÝÞÆÖÐ"
_ICELANDIC_BYTE_LEVEL = {
    "".join(_BYTE_TO_CHAR[b] for b in letter.encode("utf-8")): letter
    for letter in _ICELANDIC_LETTERS
}

_SPACE_MARKER = "\u0120"  # how byte-level tokenizers display a leading space


def _decode_token(token):
    """Turn one tokenizer token into readable text.

    A token written entirely in the byte-level alphabet is decoded exactly.
    Anything else (tokens holding characters outside that alphabet, or only
    part of a multi-byte character) gets the space marker and the known
    Icelandic letter sequences replaced and is otherwise left as it is.
    """
    try:
        return bytes(_CHAR_TO_BYTE[ch] for ch in token).decode("utf-8")
    except (KeyError, UnicodeDecodeError):
        pass

    cleaned = token.replace(_SPACE_MARKER, " ")
    for encoded, letter in _ICELANDIC_BYTE_LEVEL.items():
        cleaned = cleaned.replace(encoded, letter)
    return cleaned


def replace_encoding(tokens):
    """Return readable text for every token except the first and the last.

    The first and last entries are dropped; callers pass token lists that are
    presumably wrapped in special start/end tokens (confirm against the
    tokenizers in use).

    Args:
        tokens: Sequence of token strings.

    Returns:
        A list of decoded token strings, two shorter than ``tokens``.
    """
    return [_decode_token(token) for token in tokens[1:-1]]


def analyze_with_influence(text, bench):
    """Summarise the Partition SHAP attribution of ``text`` for class 0.

    Args:
        text: The text to explain.
        bench: Object whose ``explain(text, target=0)`` returns explanations
            exposing ``explainer``, ``tokens`` and ``scores``.

    Returns:
        A string of ``"token (score)"`` entries joined by ``"; "``; empty when
        no explanation comes from the 'Partition SHAP' explainer.
    """
    influential_words = []
    for explanation in bench.explain(text, target=0):
        if explanation.explainer != 'Partition SHAP':
            continue
        tokens = replace_encoding(explanation.tokens)
        scores = explanation.scores
        # replace_encoding drops the first and last token, so the matching
        # scores must be dropped as well; otherwise every token would be
        # paired with the score of the token before it.
        if len(scores) == len(explanation.tokens):
            scores = scores[1:-1]
        influential_words.extend(zip(tokens, scores))

    return "; ".join(f"{token} ({score:.2f})" for token, score in influential_words)


def analyze_text(icelandic_text):
    """Translate ``icelandic_text`` and explain its lowest-scoring aspect.

    Args:
        icelandic_text: Text entered by the user.

    Returns:
        A report with the English translation, the aspect with the lowest
        score and the influential words for that aspect, or a short notice
        when no text was entered.
    """
    if not icelandic_text or not icelandic_text.strip():
        return "No text provided."

    # Perform translations
    translated_text = translator(icelandic_text, max_length=512)[0]['translation_text']

    # Perform initial analysis to get scores
    sentiment_result = sentiment_classifier(icelandic_text)[0]
    formality_result = formality_classifier(icelandic_text)[0]
    # NOTE(review): the toxicity score is still a fixed placeholder and
    # detoxify_classifier is never called. Replace this with real processing
    # of its output once its result format has been confirmed.
    toxicity_mock_score = 0.5
    politeness_result = politeness_classifier(translated_text)[0]

    # Gather scores and the benchmark able to explain each aspect
    scores_labels = {
        "Sentiment": (sentiment_result['score'], sentiment_bench),
        "Formality": (formality_result['score'], formality_bench),
        "Toxicity": (toxicity_mock_score, toxicity_bench),
        "Politeness": (politeness_result['score'], politeness_bench),
    }

    # Identify the aspect with the lowest score
    lowest_aspect = min(scores_labels, key=lambda aspect: scores_labels[aspect][0])

    # Sentiment and formality are scored on the Icelandic text, the other
    # aspects on the translation, so explain the text that was scored.
    if lowest_aspect in ["Sentiment", "Formality"]:
        text_to_explain = icelandic_text
    else:
        text_to_explain = translated_text
    influential_words = analyze_with_influence(text_to_explain, scores_labels[lowest_aspect][1])

    analysis_results = (
        f" Translated Text: {translated_text}\n\n"
        f" Lowest Score Aspect: {lowest_aspect}\n"
        f" Influential Words in {lowest_aspect}: {influential_words} "
    )
    return analysis_results.strip()


demo = gr.Interface(
    fn=analyze_text,
    inputs=gr.Textbox(lines=2, placeholder="Enter Icelandic Text Here..."),
    outputs=gr.Textbox(label="Analysis Results"),
    title="Icelandic Text Analysis",
    description="This app translates Icelandic text to English and performs analysis with influential words for the aspect with the lowest score.",
)

if __name__ == "__main__":
    demo.launch()