File size: 7,351 Bytes
6ab9084
 
 
 
 
 
 
 
0723a2c
6ab9084
 
 
 
 
 
 
4133681
 
0723a2c
 
4133681
 
0723a2c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6ab9084
 
4133681
 
 
 
 
 
 
0723a2c
 
4133681
 
 
 
0723a2c
4133681
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0723a2c
4133681
 
 
 
 
0723a2c
 
 
 
 
 
 
 
4133681
 
 
 
 
0723a2c
 
 
 
 
 
 
 
4133681
0723a2c
4133681
 
 
 
 
 
 
0723a2c
 
 
d79d1b9
0723a2c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4133681
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
import pysbd
from txtai.embeddings import Embeddings
import networkx as nx
from tqdm import tqdm
from txtai.graph import GraphFactory
from datasets import load_dataset
import streamlit as st
import streamlit.components.v1 as components
import string


# --- Page chrome -----------------------------------------------------------
st.set_page_config(page_title="DebateKG")
st.title("DebateKG - Automatic Policy Debate Case Creation")
st.caption("github: https://github.com/Hellisotherpeople/DebateKG")


# --- Sidebar settings form -------------------------------------------------
# The values assigned here are module-level globals read by highlight() and
# showpath_any() further down in the script.
form = st.sidebar.form("Main Settings")
form.header("Main Settings")
# Minimum token-level similarity score for a token to be highlighted in the
# rendered case output.
highlight_threshold = form.number_input("Enter the minimum similarity value needed to highlight" , value = 0.05)
# Which DebateSum fields to show for each piece of evidence in a built case.
show_extract = form.checkbox("Show extracts", value = True)
show_abstract = form.checkbox("Show abstract", value = False)
show_full_doc = form.checkbox("Show full doc", value = False)
show_citation = form.checkbox("Show citation", value = True)
# When non-empty, the knowledge graph is filtered to nodes whose text
# contains this word before pathfinding (see the paths form at the bottom).
rerank_word = form.text_input("(Optional) Constrain all evidence in the case to have this word within its text", value = "")
form.caption("Doing this may create graphs which are so constrained that DebateKG can't find a valid path in the graph to build a case")
# Pixel dimensions of the embedded HTML component that displays the case.
html_window_width = form.number_input("Enter the pixel width of the output debate case window", value = 1000)
html_window_height = form.number_input("Enter the pixel height of the output debate case window", value = 1000)
# Pre-built semantic-graph archive loaded into the embeddings index by
# load_embeddings() below.
option = form.selectbox(
    'Which Knowledge Graph do you want to use?',
    ('DebateSum_SemanticGraph_longformer_extract.tar.gz', 'DebateSum_SemanticGraph_longformer_abstract.tar.gz', 'DebateSum_SemanticGraph_mpnet_abstract.tar.gz', 'DebateSum_SemanticGraph_legalbert_abstract.tar.gz', 'DebateSum_SemanticGraph_legalbert_extract.tar.gz', 'DebateSum_SemanticGraph_mpnet_extract.tar.gz', 'DebateSum_SemanticGraph_mpnet_sentence.tar.gz'), index = 2)

form.form_submit_button("Change Settings")

@st.cache(allow_output_mutation=True)
def load_my_dataset():
  """Fetch the train split of the DebateSum dataset, memoized by Streamlit."""
  return load_dataset("Hellisotherpeople/DebateSum", split = "train")

@st.cache(allow_output_mutation=True)
def load_embeddings():
  """Build the txtai Embeddings index configured for graph queries and load
  the knowledge-graph archive selected in the sidebar (module-level
  ``option``). Memoized by Streamlit."""
  config = {
    "path": "sentence-transformers/all-mpnet-base-v2",
    "content": True,
    # Expose graph node attributes to semantic SQL via the graph() function.
    "functions": [
      {"name": "graph", "function": "graph.attribute"},
    ],
    "expressions": [
        {"name": "topic", "expression": "graph(indexid, 'topic')"},
        {"name": "topicrank", "expression": "graph(indexid, 'topicrank')"}
    ],
    "graph": {
        "limit": 100,
        "minscore": 0.10,
        "topics": {
            "terms": 4,
            "resolution" : 100
        }
    }
  }
  index = Embeddings(config)
  index.load(option)
  return index

# Materialize the cached resources once per session.
dataset = load_my_dataset()
embeddings = load_embeddings()

# Semantic graph built over the embeddings index; graph.backend is the
# underlying networkx graph used for pathfinding below.
graph = embeddings.graph

def david_distance(source, target, attrs):
    """Edge-weight callback for networkx shortest-path search.

    Converts the edge's similarity ``weight`` into a distance (1 - weight,
    floored at 0). Edges that are *too* similar (distance under 0.15) are
    penalized back up to 1.0, steering paths away from near-duplicate hops.
    """
    d = 1.0 - attrs["weight"]
    if d < 0.0:
        d = 0.0
    return 1.00 if d < 0.15 else d

def david_showpath(source, target, the_graph):
    """Return the single shortest node path from source to target in
    the_graph, weighting edges with david_distance."""
    return nx.shortest_path(the_graph, source, target, weight=david_distance)

def david_show_all_paths(source, target, the_graph):
    """Yield every equally-short path from source to target in the_graph,
    weighting edges with david_distance."""
    return nx.all_shortest_paths(the_graph, source, target, weight=david_distance)


def highlight(index, result, threshold=None):
  """Render a txtai explain() result as HTML with influential tokens marked.

  Args:
    index: ordinal prefixed to the output line (rendered as "{index}. ").
    result: explain() result dict whose "tokens" entry is a list of
      (token, score) pairs.
    threshold: minimum score for a token to be highlighted. Defaults to the
      sidebar-configured module-level ``highlight_threshold`` (previously
      this was hard-coded; the parameter is backward compatible).

  Returns:
    An HTML string where each token scoring above the threshold is wrapped
    in a yellow (#fff59d) <span>; other tokens are emitted as plain text.
  """
  if threshold is None:
    # Fall back to the module-level sidebar setting for existing callers.
    threshold = highlight_threshold
  output = f"{index}. "
  spans = [(token, score, "#fff59d" if score > threshold else None) for token, score in result["tokens"]]

  for token, _, color in spans:
    output += f"<span style='background-color: {color}'>{token}</span> " if color else f"{token} "

  return output



def showpath_any(list_of_arguments, strip_punctuation = True, the_graph=graph.backend):
  """Build and render a debate case chaining the given arguments together.

  For each consecutive pair of graph node ids in ``list_of_arguments``, finds
  the shortest semantic path between them, concatenates those paths, then
  renders every node on the combined path — plus whichever DebateSum fields
  the sidebar checkboxes enabled — into an embedded HTML component.

  NOTE(review): the default ``the_graph=graph.backend`` is evaluated once at
  definition time; callers pass a filtered subgraph explicitly when needed.
  """
  list_of_paths = []
  # Pairwise walk: path(arg0 -> arg1), path(arg1 -> arg2), ...
  for x, y in zip(list_of_arguments, list_of_arguments[1:]):
    a_path = david_showpath(x, y, the_graph)
    list_of_paths.extend(a_path)
  #print(list_of_paths)
  path = [graph.attribute(p, "text") for p in list_of_paths]
  # Map each node's text back to a dataset row id via a similarity search.
  # Punctuation is stripped first so the text can sit inside the SQL string
  # literal (single quotes would otherwise break the query).
  list_of_evidence_ids = []
  for text in path:
    if strip_punctuation:
        text = text.translate(str.maketrans("","", string.punctuation))
    list_of_evidence_ids.append(int(embeddings.search(f"select id from txtai where similar('{text}') limit 1")[0]['id']))

  sections = []
  #sections.append(list_of_evidence_ids)
  for x, p in enumerate(path):
      if x == 0:
          # Print start node
          
          sections.append(f"{x + 1}. {p}")
          # Optional DebateSum fields for the start node.
          if show_abstract:
            sections.append(dataset["Abstract"][list_of_evidence_ids[x]])
          if show_citation:
            sections.append(dataset["Citation"][list_of_evidence_ids[x]])
          if show_extract:
            sections.append(dataset["Extract"][list_of_evidence_ids[x]])
          if show_full_doc:
            sections.append(dataset["Full-Document"][list_of_evidence_ids[x]])

      if x < len(path) - 1:
          # Explain and highlight next path element
          results = embeddings.explain(p, [path[x + 1]], limit=1)[0]
          sections.append(highlight(x + 2, results))
          # Optional DebateSum fields for that next element.
          if show_abstract:
            sections.append(dataset["Abstract"][list_of_evidence_ids[x+1]])
          if show_citation:
            sections.append(dataset["Citation"][list_of_evidence_ids[x+1]])
          if show_extract:
            sections.append(dataset["Extract"][list_of_evidence_ids[x+1]])
          if show_full_doc:
            sections.append(dataset["Full-Document"][list_of_evidence_ids[x+1]])

  return components.html("<br/><br/>".join(sections), scrolling = True, width = html_window_width, height = html_window_height)

def question(text, rerank_word = "", rerank_topic = "", limit = 100):
  """Search the index for evidence semantically similar to *text*.

  Optionally constrains hits to rows whose text contains ``rerank_word`` and
  whose topic contains ``rerank_topic``. Returns at most ``limit`` rows.

  NOTE(review): arguments are interpolated straight into the semantic SQL
  string, so text containing a single quote will break the query.
  """
  sql = (
    f"select id, text, topic, evidence_id, score from txtai where "
    f"similar('{text}') and text like '%{rerank_word}%' and topic like "
    f"'%{rerank_topic}%' limit {limit}"
  )
  return embeddings.search(sql)



# --- Step 1: query form ----------------------------------------------------
# Lets the user run arbitrary txtai semantic SQL against the index to find
# argument indexids for step 2.
query_form =  st.form("Query the Index:")
query_form.write("Step 1: Find Arguments")
query_form.write("Use semantic SQL from txtai to find some arguments, we use indexids to keep track of them.")
query_form.caption("You can use the semantic SQL to explore the dataset too! The possibilities are limitless!")
query_sql = query_form.text_area("Enter a semantic SQL statement", value = f"select topic, * from txtai where similar('Trump and US relations with China') and topic like '%trump%' and text like '%china%' limit 1")

query_form_submitted = query_form.form_submit_button("Query")

if query_form_submitted:
  with st.expander("Output (Open Me)", expanded = False):
    #my_path = showpath_any([170750, 50, 23])
    #st.write(embeddings.search(f"select * from txtai where similar('you') and text like '%the%' limit 10"))
    st.write(embeddings.search(query_sql))


# --- Step 2: case-building form --------------------------------------------
# Takes a whitespace-separated list of graph indexids and renders a debate
# case that links them via shortest semantic paths (showpath_any).
paths_form = st.form("Build the Arguments")
paths_form.write("Step 2: Build a Policy Debate Case")
paths_form.write("Enter any number of indexids (arguments), DebateKG will build a debate case out of it which links them all together")
user_paths_string = paths_form.text_area("Enter a list of indexids seperated by whitespace", value = "250 10000 2405")
user_paths_list_of_strings = user_paths_string.split()
# NOTE: raises ValueError on non-integer input.
user_paths_list = list(map(int, user_paths_list_of_strings)) 

paths_form_submitted = paths_form.form_submit_button("Build a Policy Debate Case")

if paths_form_submitted:
  if rerank_word:
    # Restrict pathfinding to the subgraph of nodes containing rerank_word.
    selected_nodes = [n for n,v in graph.backend.nodes(data=True) if rerank_word in v['text']] ##also works for topic
    H = graph.backend.subgraph(selected_nodes)
    showpath_any(user_paths_list, the_graph = H)
  else:
    showpath_any(user_paths_list)