from __future__ import annotations import numpy as np import pandas as pd import requests from huggingface_hub.hf_api import SpaceInfo url = 'https://docs.google.com/spreadsheets/d/1RoM2DgzaYJg6Ias1YNC2kQN01xSWJb1KEER9efb0X7A/edit#gid=0' csv_url = url.replace('/edit#gid=', '/export?format=csv&gid=') class DatasetList: def __init__(self): self.table = pd.read_csv(csv_url) self._preprocess_table() self.table_header = ''' Dataset Name Question Type Count Paper Lincense Access link on 🤗 Brief Description Use Cases ''' def _preprocess_table(self) -> None: self.table['dataset_name_lowercase'] = self.table.dataset_name.str.lower() self.table['count'] = self.table['count'].apply(str) rows = [] for row in self.table.itertuples(): dataset_name = f'{row.dataset_name}' if isinstance(row.dataset_name, str) else '' question_type = f'{row.question_type}' if isinstance(row.question_type, str) else '' count = f'{row.count}' if isinstance(row.count, str) else '' reference_paper = f'Link' if isinstance(row.reference_paper, str) else '' lincense = f'Link' if isinstance(row.lincense, str) else '' huggingface_link = f'HF Link' if isinstance(row.huggingface_link, str) else '' brief_description = f'{row.brief_description}' if isinstance(row.brief_description, str) else '' use_case = f'{row.use_case}' if isinstance(row.use_case, str) else '' row = f''' {dataset_name} {question_type} {count} {reference_paper} {lincense} {huggingface_link} {brief_description} {use_case} ''' rows.append(row) self.table['html_table_content'] = rows def render(self, search_query: str, case_sensitive: bool, filter_names: list[str] ) -> tuple[int, str]: df = self.table if search_query: if case_sensitive: df = df[df.dataset_name.str.contains(search_query)] else: df = df[df.dataset_name_lowercase.str.contains(search_query.lower())] has_datalink = 'Data Link' in filter_names has_paper = 'Paper' in filter_names df = self.filter_table(df, has_datalink, has_paper) return len(df), self.to_html(df, self.table_header) @staticmethod def filter_table(df: pd.DataFrame, has_datalink: bool, has_paper: bool ) -> pd.DataFrame: if has_datalink: df = df[~df.huggingface_link.isna()] if has_paper: df = df[~df.reference_paper.isna()] return df @staticmethod def to_html(df: pd.DataFrame, table_header: str) -> str: table_data = ''.join(df.html_table_content) html = f''' {table_header} {table_data}
''' return html