"""Load a dataset catalogue from a Google Sheets CSV export and render it as table text."""
from __future__ import annotations

import numpy as np
import pandas as pd
import requests
from huggingface_hub.hf_api import SpaceInfo

url = 'https://docs.google.com/spreadsheets/d/1RoM2DgzaYJg6Ias1YNC2kQN01xSWJb1KEER9efb0X7A/edit#gid=0'
# Turn the sheet's "edit" URL into its CSV export URL so pandas can read it.
csv_url = url.replace('/edit#gid=', '/export?format=csv&gid=')


class DatasetList:
    """Searchable, filterable view over the dataset spreadsheet.

    The sheet is expected to provide the columns read below:
    ``dataset_name``, ``question_type``, ``used_in_paper``,
    ``reference_paper``, ``brief_description``, ``count``,
    ``original_link``, ``publicly_available`` and ``huggingface_link``.

    NOTE(review): the header and row templates contain no markup even though
    the rendering method is named ``to_html``; presumably tags were lost
    somewhere upstream -- confirm against the deployed version.
    """

    def __init__(self) -> None:
        """Download the sheet from ``csv_url`` and pre-render every row."""
        self.table = pd.read_csv(csv_url)
        self._preprocess_table()
        # Column captions emitted ahead of the rows by ``render``.
        self.table_header = (
            ' Dataset Name Question Type Applied In Paper Reference Paper'
            ' Brief Description Count Original Access Link Publicly Available?'
            ' Access link on 🤗 '
        )

    @staticmethod
    def _text_or_blank(value: object) -> str:
        """Return *value* when it is a string, otherwise an empty string.

        Missing spreadsheet cells arrive from pandas as float NaN, so the
        ``str`` check doubles as the "cell is filled in" test.
        """
        return value if isinstance(value, str) else ''

    @staticmethod
    def _label_if_text(value: object, label: str) -> str:
        """Return *label* when *value* is a string, otherwise an empty string."""
        return label if isinstance(value, str) else ''

    def _preprocess_table(self) -> None:
        """Add the ``dataset_name_lowercase`` and ``html_table_content`` columns.

        ``count`` is converted to strings in place. Rows whose count was
        missing get an empty count cell in the rendered text instead of the
        literal ``nan`` that ``str(NaN)`` produces.
        """
        table = self.table
        table['dataset_name_lowercase'] = table.dataset_name.str.lower()

        # Record missing counts before stringifying: afterwards every value
        # is a ``str`` and a missing cell can no longer be told apart.
        count_missing = table['count'].isna()
        table['count'] = table['count'].apply(str)

        rows = []
        for row, count_text, no_count in zip(
                table.itertuples(), table['count'], count_missing):
            cells = (
                self._text_or_blank(row.dataset_name),
                self._text_or_blank(row.question_type),
                self._text_or_blank(row.used_in_paper),
                # The link-like columns below render a fixed label only;
                # the cell value itself is not emitted.
                self._label_if_text(row.reference_paper, 'Paper'),
                self._text_or_blank(row.brief_description),
                '' if no_count else count_text,
                self._label_if_text(row.original_link, 'Access Link'),
                self._label_if_text(row.publicly_available, 'License'),
                self._label_if_text(row.huggingface_link, 'HF Link'),
            )
            # One leading space, single-space separators, one trailing space.
            rows.append(f" {' '.join(cells)} ")
        table['html_table_content'] = rows

    def render(self, search_query: str, case_sensitive: bool,
               filter_names: list[str]) -> tuple[int, str]:
        """Return ``(row_count, rendered_text)`` for the matching datasets.

        Args:
            search_query: Substring to look for in the dataset name. It is
                matched literally (not as a regular expression); a falsy
                value disables the search.
            case_sensitive: When false, the lower-cased query is matched
                against the lower-cased dataset names.
            filter_names: Active filters; the values ``'Dataset'``,
                ``'Data Link'`` and ``'Paper'`` are recognised.
        """
        df = self.table
        if search_query:
            if case_sensitive:
                names, needle = df.dataset_name, search_query
            else:
                names, needle = df.dataset_name_lowercase, search_query.lower()
            # regex=False: user input such as "c++" or "(" must not be
            # compiled as a pattern. na=False: rows without a name would
            # otherwise put NaN in the mask and make the indexing raise.
            df = df[names.str.contains(needle, regex=False, na=False)]

        has_dataset = 'Dataset' in filter_names
        has_datalink = 'Data Link' in filter_names
        has_paper = 'Paper' in filter_names
        df = self.filter_table(df, has_dataset, has_datalink, has_paper)
        return len(df), self.to_html(df, self.table_header)

    @staticmethod
    def filter_table(df: pd.DataFrame, has_dataset: bool, has_datalink: bool,
                     has_paper: bool) -> pd.DataFrame:
        """Keep only the rows that have the requested fields filled in.

        Args:
            df: Table to filter.
            has_dataset: Require a non-missing ``dataset_name``.
            has_datalink: Require ``huggingface_link`` or ``original_link``
                (at least one of the two).
            has_paper: Require a non-missing ``reference_paper``.
        """
        if has_dataset:
            df = df[df.dataset_name.notna()]
        if has_datalink:
            df = df[df.huggingface_link.notna() | df.original_link.notna()]
        if has_paper:
            df = df[df.reference_paper.notna()]
        return df

    @staticmethod
    def to_html(df: pd.DataFrame, table_header: str) -> str:
        """Concatenate *table_header* with the pre-rendered rows of *df*."""
        table_data = ''.join(df.html_table_content)
        return f' {table_header} {table_data}\n'