manuelcozar55 committed on
Commit
23a1957
1 Parent(s): ba072b0

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +157 -58
app.py CHANGED
@@ -1,69 +1,168 @@
1
- from huggingface_hub import InferenceClient
2
- import gradio as gr
3
- import json
4
  import PyPDF2
 
 
 
 
 
 
 
5
 
6
# Shared Hugging Face Inference API client, bound to the Mistral 7B instruct
# checkpoint; generate() sends its text-generation request through it.
client = InferenceClient(model="mistralai/Mistral-7B-Instruct-v0.3")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9
 
10
def format_prompt(mode, message, instructions, history):
    """Build the instruct-style prompt string for one request.

    Args:
        mode: Task name placed inside the leading ``[MODE]`` tag.
        message: Text of the current request (the document contents).
        instructions: Extra instructions appended after ``message``.
        history: Iterable of ``(user_prompt, bot_response)`` pairs from
            earlier turns. ``None`` is treated as no history.

    Returns:
        The prompt: the mode header, one ``[INST] ... [/INST] ...</s> ``
        segment per past turn, then the current instruction block.
    """
    # A chat component may hand over None before the first exchange; iterating
    # None would raise TypeError.
    past_turns = [
        f"[INST] {user_prompt} [/INST] {bot_response}</s> "
        for user_prompt, bot_response in (history or [])
    ]
    return "".join(
        [
            f"<s>[MODE] {mode} [/MODE] ",
            *past_turns,
            f"[INST] {message} {instructions} [/INST]",
        ]
    )
16
-
17
def process_input(file, file_type):
    """Return the textual content of an uploaded file.

    Args:
        file: Upload object whose on-disk path is available as ``file.name``.
        file_type: ``'pdf'`` or ``'json'``. Any other value makes the function
            return ``file`` unchanged.

    Returns:
        For PDFs, the text of all pages concatenated in page order. For JSON,
        the document re-serialised with a two-space indent. Otherwise ``file``.
    """
    if file_type == 'pdf':
        # PdfFileReader / numPages / getPage were removed in PyPDF2 3.0;
        # PdfReader and .pages are the supported equivalents.
        reader = PyPDF2.PdfReader(file.name)
        # "or ''" keeps the join safe should a page yield no text object.
        return "".join(page.extract_text() or "" for page in reader.pages)
    if file_type == 'json':
        # Decode explicitly as UTF-8 instead of relying on the platform's
        # locale encoding.
        with open(file.name, 'r', encoding='utf-8') as f:
            data = json.load(f)
        return json.dumps(data, indent=2)
    return file
29
-
30
def generate(mode, file, file_type, instructions, history, temperature=0.9, max_new_tokens=256, top_p=0.95, repetition_penalty=1.0):
    """Stream a completion for the uploaded document.

    The file is converted to text with ``process_input``, wrapped into a
    prompt by ``format_prompt`` and sent to the shared ``client``.

    Args:
        mode: Task name forwarded to ``format_prompt``.
        file: Uploaded file forwarded to ``process_input``.
        file_type: File kind forwarded to ``process_input``.
        instructions: Extra instructions forwarded to ``format_prompt``.
        history: Earlier turns forwarded to ``format_prompt``.
        temperature: Sampling temperature; clamped to at least 0.01.
        max_new_tokens: Passed through to the generation call.
        top_p: Nucleus-sampling value; converted to float.
        repetition_penalty: Passed through to the generation call.

    Yields:
        The accumulated output text after each streamed token.

    Returns:
        The complete output text (as the generator's return value).
    """
    sampling_options = {
        "temperature": max(float(temperature), 1e-2),
        "top_p": float(top_p),
        "max_new_tokens": max_new_tokens,
        "repetition_penalty": repetition_penalty,
        "do_sample": True,
        "seed": 42,
    }

    document_text = process_input(file, file_type)
    prompt_text = format_prompt(mode, document_text, instructions, history)

    token_stream = client.text_generation(
        prompt_text,
        **sampling_options,
        stream=True,
        details=True,
        return_full_text=False,
    )

    accumulated = ""
    for chunk in token_stream:
        accumulated += chunk.token.text
        yield accumulated
    return accumulated
 
 
 
 
53
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# Gradio front end: pick a mode, upload a file, and stream the model's answer
# into the chat panel.
with gr.Blocks() as demo:
    mode = gr.Dropdown(label="Mode", choices=["translation", "summary", "explanation"], value="translation")
    file = gr.File(label="Input File", type="file")
    file_type = gr.Radio(label="File Type", choices=["pdf", "json"], value="pdf")
    instructions = gr.Textbox(label="Additional Instructions", placeholder="Enter any additional instructions here")
    chatbot = gr.Chatbot()

    def update_chatbot(mode, file, file_type, instructions, history):
        """Stream the model output into the chat panel.

        This has to be a generator itself: returning the generator object
        produced by ``generate`` would hand the chat component an object it
        cannot render instead of streaming text.

        Yields:
            The previous history plus one ``[instructions, partial_output]``
            pair, refreshed after every streamed token.
        """
        previous_turns = list(history or [])
        for partial_output in generate(mode, file, file_type, instructions, previous_turns):
            # The chat component expects a list of [user, bot] pairs, not a
            # bare string.
            yield previous_turns + [[instructions, partial_output]]

    gr.Interface(
        fn=update_chatbot,
        inputs=[mode, file, file_type, instructions, chatbot],
        outputs=chatbot,
        title="Mistral 7B v0.3"
    ).launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import warnings
2
+ warnings.simplefilter(action='ignore', category=FutureWarning)
3
+
4
  import PyPDF2
5
+ import gradio as gr
6
+ from langchain.prompts import PromptTemplate
7
+ from langchain.chains.summarize import load_summarize_chain
8
+ from pathlib import Path
9
+ from langchain_huggingface import ChatHuggingFace, HuggingFaceEndpoint
10
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification
11
+ import torch
12
 
13
# Text-generation model configuration: a remote Mistral 7B instruct endpoint
# wrapped in LangChain's chat interface. summarize() and translate() both call
# llm_engine_hf.
# NOTE(review): temperature is set while do_sample=False; with sampling off the
# temperature is presumably ignored — confirm which one is intended.
llm = HuggingFaceEndpoint(
    task="text-generation",
    repo_id="mistralai/Mistral-7B-Instruct-v0.3",
    do_sample=False,
    temperature=0.5,
    max_new_tokens=4096,
)
llm_engine_hf = ChatHuggingFace(llm=llm)
22
+
23
# Classification model configuration.
# The label map comes first so the checkpoint can be loaded with a matching
# number of output classes.
id2label = {0: "multas", 1: "politicas_de_privacidad", 2: "contratos", 3: "denuncias", 4: "otros"}

tokenizer = AutoTokenizer.from_pretrained("mrm8488/legal-longformer-base-8192-spanish")
# Without num_labels the sequence-classification head falls back to the config
# default of two outputs, so class ids 2-4 of id2label could never be predicted.
# NOTE(review): this appears to be a base (not fine-tuned) checkpoint, so the
# classification head is presumably randomly initialised — confirm that a
# fine-tuned checkpoint is not what was intended here.
model = AutoModelForSequenceClassification.from_pretrained(
    "mrm8488/legal-longformer-base-8192-spanish",
    num_labels=len(id2label),
    id2label=id2label,
    label2id={label: class_id for class_id, label in id2label.items()},
)
28
+
29
def read_pdf(file_path):
    """Extract the text of every page of a PDF as a single string.

    Args:
        file_path: Location of the PDF; passed straight to
            ``PyPDF2.PdfReader``.

    Returns:
        The page texts concatenated in page order with no separator.
    """
    pdf_reader = PyPDF2.PdfReader(file_path)
    # Iterate the pages directly and join once instead of growing a string by
    # index. "or ''" guards against a page yielding no text object, which
    # would otherwise break the concatenation.
    return "".join(page.extract_text() or "" for page in pdf_reader.pages)
35
 
36
def summarize(file):
    """Summarise an uploaded document as a short bulleted list.

    Args:
        file: Upload object whose on-disk path is available as ``file.name``.
            Files ending in ``.pdf`` (any letter case) are parsed with
            ``read_pdf``; everything else is read as UTF-8 text.

    Returns:
        The ``content`` of the chat model's reply.
    """
    # Read the content of the uploaded file.
    file_path = file.name
    # Compare the extension case-insensitively: "REPORT.PDF" must go through
    # the PDF parser rather than being decoded as UTF-8 text.
    if file_path.lower().endswith('.pdf'):
        text = read_pdf(file_path)
    else:
        with open(file_path, 'r', encoding='utf-8') as f:
            text = f.read()

    template = '''
Please carefully read the following document:
<document>
{TEXT}
</document>
After reading through the document, identify the key points and main ideas covered in the text. Organize these key points into a concise bulleted list that summarizes the essential information from the document. The summary should have a maximum of 10 bullet points.
Your goal is to be comprehensive in capturing the core content of the document, while also being concise in how you express each summary point. Omit minor details and focus on the central themes and important facts.
'''

    prompt = PromptTemplate(
        template=template,
        input_variables=['TEXT']
    )

    formatted_prompt = prompt.format(TEXT=text)
    output_summary = llm_engine_hf.invoke(formatted_prompt)

    return output_summary.content
63
 
64
def classify_text(text):
    """Predict the document category of ``text``.

    Args:
        text: Document text; tokenised and truncated to at most 4096 tokens.

    Returns:
        The ``id2label`` entry for the highest-scoring class.
    """
    # A single sequence needs no batch padding. Padding to max_length made
    # every call run a full 4096-token forward pass, however short the
    # document was.
    inputs = tokenizer(text, return_tensors="pt", max_length=4096, truncation=True)
    model.eval()
    # Inference only: skip gradient bookkeeping.
    with torch.no_grad():
        logits = model(**inputs).logits
    predicted_class_id = logits.argmax(dim=-1).item()
    return id2label[predicted_class_id]
73
 
74
def translate(file, target_language):
    """Translate an uploaded document with the chat model.

    Args:
        file: Upload object whose on-disk path is available as ``file.name``.
            Files ending in ``.pdf`` (any letter case) are parsed with
            ``read_pdf``; everything else is read as UTF-8 text.
        target_language: Language to translate into, inserted verbatim into
            the prompt.

    Returns:
        The ``content`` of the chat model's reply.

    Raises:
        ValueError: If ``target_language`` is empty or ``None``.
    """
    # Without this check the prompt would literally ask for a translation
    # "to None".
    if not target_language:
        raise ValueError("target_language is required for translation")

    # Read the content of the uploaded file.
    file_path = file.name
    # Case-insensitive so "DOC.PDF" is parsed as a PDF, not decoded as text.
    if file_path.lower().endswith('.pdf'):
        text = read_pdf(file_path)
    else:
        with open(file_path, 'r', encoding='utf-8') as f:
            text = f.read()

    template = '''
Please translate the following document to {LANGUAGE}:
<document>
{TEXT}
</document>
Ensure that the translation is accurate and preserves the original meaning of the document.
'''

    prompt = PromptTemplate(
        template=template,
        input_variables=['TEXT', 'LANGUAGE']
    )

    formatted_prompt = prompt.format(TEXT=text, LANGUAGE=target_language)
    translated_text = llm_engine_hf.invoke(formatted_prompt)

    return translated_text.content
100
+
101
def process_file(file, action, target_language=None):
    """Dispatch an uploaded file to the action chosen in the UI.

    Args:
        file: Upload object exposing its path as ``file.name``, or ``None``
            when nothing has been uploaded.
        action: ``"Resumen"``, ``"Clasificar"`` or ``"Traducir"``.
        target_language: Language for ``"Traducir"``; unused otherwise.

    Returns:
        The summary, predicted label or translation, or a short user-facing
        message when the request cannot be processed.
    """
    if action not in ("Resumen", "Clasificar", "Traducir"):
        return "Acción no válida"

    # Pressing the button without an upload passes None; report that instead
    # of failing on file.name.
    if file is None:
        return "Por favor, suba un archivo."

    if action == "Resumen":
        return summarize(file)

    if action == "Traducir":
        # The language picker starts with no selection; do not ask the model
        # to translate "to None".
        if not target_language:
            return "Por favor, seleccione un idioma de traducción."
        return translate(file, target_language)

    # Remaining action: "Clasificar".
    file_path = file.name
    # Case-insensitive so "DOC.PDF" is parsed as a PDF, not decoded as text.
    if file_path.lower().endswith('.pdf'):
        text = read_pdf(file_path)
    else:
        with open(file_path, 'r', encoding='utf-8') as f:
            text = f.read()
    return classify_text(text)
116
+
117
def download_text(output_text, filename='output.txt'):
    """Write ``output_text`` to ``filename`` as UTF-8 and return its path.

    Args:
        output_text: Text to save. Empty or ``None`` means nothing to save.
        filename: Destination file name or path.

    Returns:
        A ``Path`` to the written file, or ``None`` if there was no text
        (nothing is written in that case).
    """
    if not output_text:
        return None

    destination = Path(filename)
    destination.write_text(output_text, encoding='utf-8')
    return destination
125
+
126
def create_download_file(output_text, filename='output.txt'):
    """Save ``output_text`` via ``download_text`` and return the path as a string.

    Args:
        output_text: Text to save.
        filename: Destination file name or path.

    Returns:
        The written file's path as ``str``, or ``None`` when
        ``download_text`` wrote nothing.
    """
    written_path = download_text(output_text, filename)
    if not written_path:
        return None
    return str(written_path)
129
+
130
# Build the Gradio interface.
with gr.Blocks() as demo:
    gr.Markdown("## Document Processor")

    with gr.Row():
        with gr.Column():
            file = gr.File(label="Subir un archivo")
            action = gr.Radio(label="Seleccione una acción", choices=["Resumen", "Clasificar", "Traducir"])
            target_language = gr.Dropdown(label="Seleccionar idioma de traducción", choices=["en", "fr", "de"], visible=False)

        with gr.Column():
            output_text = gr.Textbox(label="Resultado", lines=20)

    def update_language_dropdown(action):
        """Show the language picker only while the translate action is selected."""
        return gr.update(visible=(action == "Traducir"))

    action.change(update_language_dropdown, inputs=action, outputs=target_language)

    submit_button = gr.Button("Procesar")
    submit_button.click(process_file, inputs=[file, action, target_language], outputs=output_text)

    def generate_file(result_text, selected_action):
        """Save the current result to a text file for download.

        The live values arrive as arguments because they are listed in the
        click handler's ``inputs``. Reading ``output_text.value`` or
        ``action.value`` would only return the values the components were
        constructed with, not what the user currently sees.

        Returns:
            The saved file's path as a string, or ``None`` if there is no
            result to save.
        """
        filename = 'translation.txt' if selected_action == 'Traducir' else 'summary.txt'
        # create_download_file returns a plain string path (or None).
        return create_download_file(result_text, filename)

    download_button = gr.Button("Descargar Resultado")
    download_button.click(
        fn=generate_file,
        inputs=[output_text, action],
        outputs=gr.File()
    )

# Launch the Gradio application.
demo.launch(share=True)