elibrowne committed on
Commit
e65ac7c
1 Parent(s): 1991ca2

Question data for E5 and ColBERT online and formatted

Browse files
Files changed (3) hide show
  1. create_json_data.py +37 -0
  2. question_data.csv +0 -0
  3. question_data.json +0 -0
create_json_data.py ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# Convert question_data.csv into question_data.json.
#
# Reads every row of the CSV, drops the two header rows, and writes a single
# JSON object that maps each question id to a dictionary describing that
# question (text, answer options, retrievals and generations).
import json
import csv

# Column layout of question_data.csv (0-based index -> header label):
#   0      qid
#   1      prompt
#   2      question
#   3-6    a, b, c, d
#   7      answer
#   8      gold_passage
#   9-18   top10_colbert (10 retrievals)
#   19     generation_colbert
#   20-29  top10_e5 (10 retrievals)
#   30     generation_e5
#   31     gold_passage_generation
# See example.json for how these files will be ported

# Translates the answer letter in column 7 into an index into "answers".
# Built once here rather than on every loop iteration.
answer_map = {"A": 0, "B": 1, "C": 2, "D": 3}

# newline="" is what the csv module asks for when handed a file object, so
# that line breaks embedded inside quoted fields are parsed correctly.
with open("question_data.csv", "r", newline="") as f:
    # Cut off the top two rows (labels, passage #s).
    questions = list(csv.reader(f))[2:]

full_question_dict = {}  # stores all "id": q_data pairs
for entry in questions:
    # A non-empty prompt is prepended to the question text, separated by a
    # single space. The row itself is left untouched.
    question_text = entry[2]
    if entry[1] != "":
        question_text = entry[1] + " " + question_text

    # Create individual question data (key order matches the original output).
    q_data = {
        "question": question_text,
        "answers": entry[3:7],  # columns 3-6 -> A, B, C, D
        "correct_answer_index": answer_map[entry[7]],  # e.g. "A" -> 0
        "top10_colbert": entry[9:19],  # columns 9-18 -> 10 retrievals
        "generation_colbert": entry[19],
        "top10_e5": entry[20:30],  # columns 20-29 -> 10 retrievals
        "generation_e5": entry[30],
        # Hard-coded placeholder: always False here and NOT computed from the
        # retrievals, so it does not reflect whether the gold passage was
        # actually retrieved.
        "top10_contains_gold_passage": False,
        "gold_passage": entry[8],
        "gold_passage_generation": entry[31],
    }

    # Add to full question dictionary, keyed by qid.
    full_question_dict[entry[0]] = q_data

with open("question_data.json", "w") as f:
    json.dump(full_question_dict, f)
question_data.csv ADDED
The diff for this file is too large to render. See raw diff
 
question_data.json ADDED
The diff for this file is too large to render. See raw diff