Malikeh Ehghaghi committed
Commit c02fbf1
1 Parent(s): bd757bc

Add files via upload

Files changed (4):
  1. app.py +109 -0
  2. dataset_list.py +101 -0
  3. requirements.txt +4 -0
  4. style.css +18 -0
app.py ADDED
@@ -0,0 +1,109 @@
+ #!/usr/bin/env python
+
+ from __future__ import annotations
+
+ import gradio as gr
+
+ from dataset_list import DatasetList
+
+ DESCRIPTION = '# Explore Medical Question Answering Datasets 🏥'
+ NOTES = '''
+ '''
+ FOOTER = ''''''
+
+ def main():
+     dataset_list = DatasetList()
+
+     with gr.Blocks(css='style.css') as demo:
+         gr.Markdown(DESCRIPTION)
+
+         search_box = gr.Textbox(
+             label='Search Dataset Name',
+             placeholder=
+             'You can search for titles with regular expressions. e.g. (?<!sur)face',
+             max_lines=1)
+
+         case_sensitive = gr.Checkbox(label='Case Sensitive')
+
+         filter_names = gr.CheckboxGroup(choices=[
+             'Dataset',
+             'Data Link',
+             'Paper',
+         ], label='Filter')
+
+         # data_type_names = [
+         #     'DNA', 'scRNA', 'mRNA', 'scRNA perturbation', 'RNA structure prediction', 'RNA language model', 'protein language model', 'protein structure prediction',
+         #     'protein generation', 'protein function prediction', 'protein fitness prediction', 'antibody structure prediction', 'antibody language model', 'molecules',
+         #     'ligand generation', 'reaction-to-enzyme', 'enzyme generation', 'epigenomic', 'molecular docking', 'peptide property prediction',
+         # ]
+
+         # data_types = gr.CheckboxGroup(choices=data_type_names,
+         #                               value=data_type_names,
+         #                               label='Type')
+
+         # years = ['2020', '2021', '2022', '2023']
+
+         # years_checkbox = gr.CheckboxGroup(choices=years, value=years, label='Year of Publication/Preprint')
+
+         # model_type_names = [
+         #     'GPT2', 'GPT-Neo', 'GPT-NeoX', 'ESM', 'BERT', 'RoBERTa', 'BART', 'T5', 'MPNN', 'diffusion', 'custom model'
+         # ]
+
+         # model_types = gr.CheckboxGroup(choices=model_type_names,
+         #                                value=model_type_names,
+         #                                label='Base Model')
+
+         search_button = gr.Button('Search')
+
+         number_of_datasets = gr.Textbox(label='Number of Datasets Found')
+         table = gr.HTML(show_label=False)
+
+         gr.Markdown(NOTES)
+         gr.Markdown(FOOTER)
+
+         demo.load(fn=dataset_list.render,
+                   inputs=[
+                       search_box,
+                       case_sensitive,
+                       filter_names,
+                       # data_types,
+                       # years_checkbox,
+                       # model_types
+                   ],
+                   outputs=[
+                       number_of_datasets,
+                       table,
+                   ])
+         search_box.submit(fn=dataset_list.render,
+                           inputs=[
+                               search_box,
+                               case_sensitive,
+                               filter_names,
+                               # data_types,
+                               # years_checkbox,
+                               # model_types
+                           ],
+                           outputs=[
+                               number_of_datasets,
+                               table,
+                           ])
+
+         search_button.click(fn=dataset_list.render,
+                             inputs=[
+                                 search_box,
+                                 case_sensitive,
+                                 filter_names,
+                                 # data_types,
+                                 # years_checkbox,
+                                 # model_types
+                             ],
+                             outputs=[
+                                 number_of_datasets,
+                                 table,
+                             ])
+     demo.launch(enable_queue=True, share=False)
+
+
+
+ if __name__ == '__main__':
+     main()
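
Note on the wiring above: demo.load, search_box.submit, and search_button.click all register dataset_list.render with identical inputs and outputs. A minimal sketch of how that duplication could be collapsed inside the same gr.Blocks context, assuming the component names defined above; the render_inputs and render_outputs variables are illustrative and not part of this commit:

        # Sketch only: share one inputs/outputs definition across the three events.
        render_inputs = [search_box, case_sensitive, filter_names]
        render_outputs = [number_of_datasets, table]
        for register in (demo.load, search_box.submit, search_button.click):
            register(fn=dataset_list.render,
                     inputs=render_inputs,
                     outputs=render_outputs)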
dataset_list.py ADDED
@@ -0,0 +1,101 @@
+ from __future__ import annotations
+
+ import numpy as np
+ import pandas as pd
+ import requests
+ from huggingface_hub.hf_api import SpaceInfo
+
+ url = 'https://docs.google.com/spreadsheets/d/1RoM2DgzaYJg6Ias1YNC2kQN01xSWJb1KEER9efb0X7A/edit#gid=0'
+ csv_url = url.replace('/edit#gid=', '/export?format=csv&gid=')  # rewrite the sheet's edit URL into its CSV export endpoint
+
+ class DatasetList:
+     def __init__(self):
+         self.table = pd.read_csv(csv_url)
+         self._preprocess_table()
+
+         self.table_header = '''
+             <tr>
+                 <td width="15%">Dataset Name</td>
+                 <td width="10%">Question Type</td>
+                 <td width="10%">Applied In Paper</td>
+                 <td width="10%">Reference Paper</td>
+                 <td width="20%">Brief Description</td>
+                 <td width="5%">Count</td>
+                 <td width="10%">Original Access Link</td>
+                 <td width="10%">Publicly Available?</td>
+                 <td width="10%">Access link on 🤗</td>
+             </tr>'''
+
+     def _preprocess_table(self) -> None:
+         self.table['dataset_name_lowercase'] = self.table.dataset_name.str.lower()
+         self.table['count'] = self.table['count'].apply(str)
+
+         rows = []  # pre-rendered HTML <tr> snippets, one per dataset
+         for row in self.table.itertuples():
+             dataset_name = f'{row.dataset_name}' if isinstance(row.dataset_name, str) else ''
+             question_type = f'{row.question_type}' if isinstance(row.question_type, str) else ''
+             used_in_paper = f'{row.used_in_paper}' if isinstance(row.used_in_paper, str) else ''
+             reference_paper = f'<a href="{row.reference_paper}" target="_blank">Paper</a>' if isinstance(row.reference_paper, str) else ''
+             brief_description = f'{row.brief_description}' if isinstance(row.brief_description, str) else ''
+             count = f'{row.count}' if isinstance(row.count, str) else ''
+             original_link = f'<a href="{row.original_link}" target="_blank">Access Link</a>' if isinstance(row.original_link, str) else ''
+             publicly_available = f'<a href="{row.publicly_available}" target="_blank">License</a>' if isinstance(row.publicly_available, str) else ''
+             huggingface_link = f'<a href="{row.huggingface_link}" target="_blank">HF Link</a>' if isinstance(row.huggingface_link, str) else ''
+             row = f'''
+             <tr>
+                 <td>{dataset_name}</td>
+                 <td>{question_type}</td>
+                 <td>{used_in_paper}</td>
+                 <td>{reference_paper}</td>
+                 <td>{brief_description}</td>
+                 <td>{count}</td>
+                 <td>{original_link}</td>
+                 <td>{publicly_available}</td>
+                 <td>{huggingface_link}</td>
+             </tr>'''
+             rows.append(row)
+         self.table['html_table_content'] = rows
+
+     def render(self, search_query: str,
+                case_sensitive: bool,
+                filter_names: list[str]
+                ) -> tuple[int, str]:
+         df = self.table
+         if search_query:
+             if case_sensitive:
+                 df = df[df.dataset_name.str.contains(search_query)]
+             else:
+                 df = df[df.dataset_name_lowercase.str.contains(search_query.lower())]
+         has_dataset = 'Dataset' in filter_names
+         has_datalink = 'Data Link' in filter_names
+         has_paper = 'Paper' in filter_names
+         df = self.filter_table(df, has_dataset, has_datalink, has_paper)
+         # df = self.filter_table(df, has_paper, has_github, has_model, data_types, model_types)
+         return len(df), self.to_html(df, self.table_header)
+
+     @staticmethod
+     def filter_table(df: pd.DataFrame,
+                      has_dataset: bool,
+                      has_datalink: bool,
+                      has_paper: bool
+                      ) -> pd.DataFrame:
+         if has_dataset:
+             df = df[~df.dataset_name.isna()]
+         if has_datalink:
+             df = df[~df.huggingface_link.isna() | ~df.original_link.isna()]
+         if has_paper:
+             df = df[~df.reference_paper.isna()]
+         # df = df[df.data_type.isin(set(data_types))]
+         # df = df[df.base_model.isin(set(model_types))]
+         # df = df[df.year.isin(set(years))]
+         return df
+
+     @staticmethod
+     def to_html(df: pd.DataFrame, table_header: str) -> str:
+         table_data = ''.join(df.html_table_content)
+         html = f'''
+         <table>
+             {table_header}
+             {table_data}
+         </table>'''
+         return html
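
For reference, a minimal usage sketch of the class above outside of Gradio, assuming the Google Sheet stays publicly readable; an empty query with no filters returns every row:

    from dataset_list import DatasetList

    dataset_list = DatasetList()                      # downloads and preprocesses the sheet
    count, html = dataset_list.render('', False, [])  # no query, case-insensitive, no filters
    print(count)       # number of datasets found
    print(html[:300])  # start of the generated HTML table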
requirements.txt ADDED
@@ -0,0 +1,4 @@
+ streamlit
+ gradio
+ numpy
+ pandas
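
Of these four pinned packages, pandas is the one dataset_list.py relies on to load the sheet. A quick sanity-check sketch, with the URL copied from dataset_list.py above and the expected columns being whatever the sheet defines:

    import pandas as pd

    url = 'https://docs.google.com/spreadsheets/d/1RoM2DgzaYJg6Ias1YNC2kQN01xSWJb1KEER9efb0X7A/edit#gid=0'
    csv_url = url.replace('/edit#gid=', '/export?format=csv&gid=')  # same edit-URL-to-CSV-export rewrite as in dataset_list.py
    print(pd.read_csv(csv_url).columns.tolist())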
style.css ADDED
@@ -0,0 +1,18 @@
+ h1 {
+     text-align: center;
+ }
+ table a {
+     background-color: transparent;
+     color: #58a6ff;
+     text-decoration: none;
+ }
+ a:active,
+ a:hover {
+     outline-width: 0;
+ }
+ a:hover {
+     text-decoration: underline;
+ }
+ table, th, td {
+     border: 1px solid;
+ }