"""Client for the PBS public data API plus a rheumatology-biologics export script.

Running the module fetches the PBS tables needed to describe the listed
rheumatology drugs, joins them in memory, and pushes the flattened result to
the Hugging Face Hub.
"""

import csv
import datetime
import os
import sys
import time
from io import StringIO

import requests
from datasets import Dataset
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

# Key that used to be hard-coded in main(); kept as the fallback so existing
# invocations keep working.
# NOTE(review): confirm this key is intended to be committed to source control.
DEFAULT_SUBSCRIPTION_KEY = "2384af7c667342ceb5a736fe29f1dc6b"

# Lower-case substrings matched against each item's ``drug_name``.
BIOLOGICS = (
    "adalimumab", "etanercept", "infliximab", "certolizumab", "golimumab",
    "rituximab", "abatacept", "tocilizumab", "secukinumab", "ixekizumab",
    "ustekinumab", "guselkumab", "tofacitinib", "baricitinib",
    "upadacitinib", "anifrolumab",
)

# Lower-case substrings matched against each indication's ``condition``.
RHEUMATIC_DISEASES = (
    "rheumatoid arthritis",
    "psoriatic arthritis",
    "ankylosing spondylitis",
    "non-radiographic axial spondyloarthritis",
    "giant cell arteritis",
    "juvenile idiopathic arthritis",
    "systemic lupus erythematosus",
)

# (label, keywords) pairs checked in order; the first label with a matching
# keyword wins.
# NOTE(review): matching is by substring, so "pen" also matches words such as
# "suspension" - confirm that cannot occur for the drugs listed above.
_FORMULATION_KEYWORDS = (
    ("tablet", ("tablet",)),
    ("subcut pen", ("pen", "auto-injector", "autoinjector")),
    ("subcut syringe", ("syringe",)),
    ("infusion", ("i.v. infusion", "concentrate for injection")),
)

# Table name -> endpoint for everything fetch_sample_data() samples.
_SAMPLE_ENDPOINTS = (
    "amt-items",
    "atc-codes",
    "indications",
    "prescribing-texts",
    "item-prescribing-text-relationships",
    "restrictions",
    "item-restriction-relationships",
)


def _classify_formulation(description):
    """Map a free-text form description to a coarse formulation label.

    Returns one of ``'tablet'``, ``'subcut pen'``, ``'subcut syringe'``,
    ``'infusion'`` or ``'unknown'``. Matching is case-insensitive; a missing
    description (``None`` or empty) yields ``'unknown'``.
    """
    desc_lower = (description or "").lower()
    for label, keywords in _FORMULATION_KEYWORDS:
        if any(keyword in desc_lower for keyword in keywords):
            return label
    return "unknown"


def _classify_hospital_type(program_code):
    """Translate a program code into 'Private' (HS), 'Public' (HB) or 'Any'."""
    if program_code == "HS":
        return "Private"
    if program_code == "HB":
        return "Public"
    return "Any"


def _group_values(rows, key_field, value_field):
    """Group ``row[value_field]`` by ``row[key_field]``, skipping empty values."""
    grouped = {}
    for row in rows:
        key = row.get(key_field)
        value = row.get(value_field)
        if key and value:
            grouped.setdefault(key, []).append(value)
    return grouped


def _parse_retry_after(value, default=60):
    """Return a ``Retry-After`` header as whole seconds.

    Falls back to *default* when the header is absent or is not a plain
    integer (for example when it is given as a date).
    """
    try:
        seconds = int(value)
    except (TypeError, ValueError):
        return default
    return max(seconds, 0)


class PBSPublicDataAPIClient:
    """Rate-limited HTTP client for the PBS public data API.

    Args:
        subscription_key: Value sent in the ``subscription-key`` header.
        base_url: Root URL that endpoint names are appended to.
        rate_limit: Maximum requests per second; must be greater than zero.
        timeout: Timeout in seconds passed to every request, or ``None`` to
            wait indefinitely.

    Raises:
        ValueError: If ``rate_limit`` is not positive.
    """

    def __init__(self, subscription_key,
                 base_url='https://data-api.health.gov.au/pbs/api/v3',
                 rate_limit=0.2, timeout=120):
        if rate_limit <= 0:
            # A zero rate would divide by zero when the interval is computed.
            raise ValueError("rate_limit must be greater than zero")
        self.subscription_key = subscription_key
        self.base_url = base_url
        self.rate_limit = rate_limit  # Requests per second
        self.timeout = timeout
        self.last_request_time = 0

        # Session-level retries for throttling and transient server errors.
        self.session = requests.Session()
        retries = Retry(total=5, backoff_factor=1,
                        status_forcelist=[429, 500, 502, 503, 504])
        self.session.mount('https://', HTTPAdapter(max_retries=retries))

    # ------------------------------------------------------------------
    # Low-level request helpers
    # ------------------------------------------------------------------
    def _throttle(self):
        """Sleep until at least ``1 / rate_limit`` seconds separate requests."""
        min_interval = 1 / self.rate_limit
        elapsed = time.time() - self.last_request_time
        if elapsed < min_interval:
            time.sleep(min_interval - elapsed)

    def make_request(self, endpoint, params=None, accept="application/json",
                     max_attempts=5):
        """GET ``base_url/endpoint`` and return the successful response.

        Args:
            endpoint: Path appended to ``base_url``.
            params: Optional query parameters.
            accept: Value of the ``Accept`` header.
            max_attempts: How many times to try before giving up.

        Returns:
            The ``requests.Response`` of the first successful attempt.

        Raises:
            ValueError: If ``max_attempts`` is less than 1.
            requests.exceptions.RequestException: Immediately for 4xx
                responses other than 429 (retrying cannot fix them), or once
                ``max_attempts`` is exhausted for anything else.
        """
        if max_attempts < 1:
            raise ValueError("max_attempts must be at least 1")

        url = f"{self.base_url}/{endpoint}"
        headers = {
            "subscription-key": self.subscription_key,
            "Accept": accept,
        }

        for attempt in range(1, max_attempts + 1):
            self._throttle()
            try:
                try:
                    response = self.session.get(
                        url, headers=headers, params=params,
                        timeout=self.timeout,
                    )
                finally:
                    # Failed attempts count towards the rate limit too.
                    self.last_request_time = time.time()

                if response.status_code == 429 and attempt < max_attempts:
                    retry_after = _parse_retry_after(
                        response.headers.get('Retry-After'))
                    print(f"Rate limit exceeded. Waiting for {retry_after} seconds.")
                    time.sleep(retry_after)
                    continue

                response.raise_for_status()
                return response
            except requests.exceptions.RequestException as e:
                status = getattr(getattr(e, "response", None),
                                 "status_code", None)
                permanent = (status is not None
                             and 400 <= status < 500
                             and status != 429)
                if permanent or attempt == max_attempts:
                    raise
                print(f"Request failed: {str(e)}. Retrying in 5 seconds...")
                time.sleep(5)

    def get_raw_data(self, endpoint, params=None, accept="application/json"):
        """Return the response body of ``endpoint`` as text."""
        response = self.make_request(endpoint, params=params, accept=accept)
        return response.text

    def _get_csv(self, endpoint, params):
        """Request ``endpoint`` as CSV and return its rows as a list of dicts."""
        response = self.make_request(endpoint, params=params, accept="text/csv")
        return list(csv.DictReader(StringIO(response.text)))

    def _get_schedule_table(self, endpoint, schedule_code, limit):
        """Fetch one CSV table filtered to ``schedule_code``."""
        params = {"schedule_code": schedule_code, "limit": limit}
        return self._get_csv(endpoint, params)

    # ------------------------------------------------------------------
    # Endpoint wrappers
    # ------------------------------------------------------------------
    def get_sample_data(self, endpoint, limit=5):
        """Return up to ``limit`` CSV rows from ``endpoint``."""
        return self._get_csv(endpoint, {"limit": limit})

    def fetch_sample_data(self):
        """Fetch a few rows from each sampled endpoint.

        Returns:
            Dict mapping endpoint name to its sample rows; endpoints that
            return no rows are omitted. Pacing between requests is enforced
            by ``make_request``.
        """
        sample_data = {}
        for endpoint in _SAMPLE_ENDPOINTS:
            print(f"Fetching sample data from /{endpoint}...")
            data = self.get_sample_data(endpoint)
            if data:
                sample_data[endpoint] = data
                print(f"Sample keys for {endpoint}: {data[0].keys()}")
            else:
                print(f"No data found for {endpoint}")
        return sample_data

    def get_schedules(self, limit=100):
        """Return the ``data`` list of the JSON ``schedules`` endpoint."""
        response = self.make_request("schedules", params={"limit": limit})
        return response.json()['data']

    def get_amt_items(self, schedule_code, limit=100000):
        """Return ``amt-items`` rows for ``schedule_code``."""
        return self._get_schedule_table("amt-items", schedule_code, limit)

    def get_atc_codes(self, schedule_code, limit=100000):
        """Return ``atc-codes`` rows for ``schedule_code``."""
        return self._get_schedule_table("atc-codes", schedule_code, limit)

    def get_indications(self, schedule_code, limit=100000):
        """Return ``indications`` rows for ``schedule_code``."""
        return self._get_schedule_table("indications", schedule_code, limit)

    def get_prescribing_texts(self, schedule_code, limit=100000):
        """Return ``prescribing-texts`` rows for ``schedule_code``."""
        return self._get_schedule_table(
            "prescribing-texts", schedule_code, limit)

    def get_item_prescribing_text_relationships(self, schedule_code,
                                                limit=100000):
        """Return ``item-prescribing-text-relationships`` rows."""
        return self._get_schedule_table(
            "item-prescribing-text-relationships", schedule_code, limit)

    def get_restrictions(self, schedule_code, limit=100000):
        """Return ``restrictions`` rows for ``schedule_code``."""
        return self._get_schedule_table("restrictions", schedule_code, limit)

    def get_item_restriction_relationships(self, schedule_code, limit=100000):
        """Return ``item-restriction-relationships`` rows."""
        return self._get_schedule_table(
            "item-restriction-relationships", schedule_code, limit)

    def get_restriction_prescribing_text_relationships(self, schedule_code,
                                                       limit=100000):
        """Return ``restriction-prescribing-text-relationships`` rows."""
        return self._get_schedule_table(
            "restriction-prescribing-text-relationships", schedule_code, limit)

    def get_items(self, schedule_code, limit=100000):
        """Return ``items`` rows for ``schedule_code``."""
        return self._get_schedule_table("items", schedule_code, limit)

    # ------------------------------------------------------------------
    # Rheumatology export
    # ------------------------------------------------------------------
    @staticmethod
    def _select_schedule(schedules, today):
        """Pick the schedule effective in ``today``'s month, else the first.

        Raises:
            IndexError: If ``schedules`` is empty.
        """
        if not schedules:
            raise IndexError("PBS API returned no schedules")
        month_name = today.strftime('%B').upper()
        return next(
            (s for s in schedules
             if s['effective_year'] == today.year
             and s['effective_month'] == month_name),
            schedules[0],  # fallback to the first schedule returned
        )

    @staticmethod
    def _build_restriction_entry(res_code, restriction, indication_name):
        """Flatten one restriction row into the export's restriction record."""
        authority_method = restriction.get('authority_method', '')
        streamlined = authority_method == "STREAMLINED"
        schedule_html = restriction.get('schedule_html_text') or ''
        return {
            'res_code': res_code,
            'indications': indication_name,
            'treatment_phase': restriction.get('treatment_phase', ''),
            'restriction_text': restriction.get('li_html_text', ''),
            'authority_method': authority_method,
            'streamlined_code': (restriction.get('treatment_of_code')
                                 if streamlined else None),
            # Restrictions quoting the postal address are treated as not
            # available for online application.
            'online_application': "HOBART TAS 7001" not in schedule_html,
        }

    def fetch_rheumatology_biologics_data(self):
        """Fetch and join PBS tables for the drugs in ``BIOLOGICS``.

        Selects a schedule, downloads the items, indications, restrictions
        and relationship tables for it, and attaches to each matching item
        the restrictions whose indication names one of
        ``RHEUMATIC_DISEASES``. Request pacing is left to ``make_request``,
        which enforces ``rate_limit`` before every call.

        Returns:
            Dict keyed by PBS code. Items without any matching restriction
            are dropped.

        Raises:
            IndexError: If the API returns no schedules.
            requests.exceptions.RequestException: If a request fails
                permanently.
        """
        schedules = self.get_schedules()
        current_schedule = self._select_schedule(
            schedules, datetime.datetime.now())
        latest_schedule = current_schedule['schedule_code']
        schedule_year = current_schedule['effective_year']
        schedule_month = current_schedule['effective_month']
        print(f"Selected schedule: {latest_schedule} (Effective: {current_schedule['effective_date']})")

        print("Fetching items...")
        items = self.get_items(latest_schedule)

        print("Fetching indications...")
        indications = self.get_indications(latest_schedule)
        print(f"Number of indications fetched: {len(indications)}")
        print("Sample of raw indications data:")
        for indication in indications[:5]:
            print(indication)

        print("Fetching prescribing texts...")
        prescribing_texts = self.get_prescribing_texts(latest_schedule)

        print("Fetching item-prescribing-text relationships...")
        item_prescribing_text_relationships = (
            self.get_item_prescribing_text_relationships(latest_schedule))

        print("Fetching restrictions...")
        restrictions = self.get_restrictions(latest_schedule)

        print("Fetching item-restriction relationships...")
        item_restriction_relationships = (
            self.get_item_restriction_relationships(latest_schedule))

        print("Fetching restriction-prescribing-text relationships...")
        restriction_prescribing_text_relationships = (
            self.get_restriction_prescribing_text_relationships(
                latest_schedule))
        print(f"Number of restriction-prescribing-text relationships fetched: {len(restriction_prescribing_text_relationships)}")

        # Lookup dictionaries. prescribing_text_lookup and
        # item_prescribing_text_lookup are only reported in the debug output.
        prescribing_text_lookup = {
            text['prescribing_txt_id']: text
            for text in prescribing_texts if 'prescribing_txt_id' in text
        }
        restriction_lookup = {
            res['res_code']: res
            for res in restrictions if 'res_code' in res
        }

        if indications:
            print("Keys in indication data:", indications[0].keys())
        indication_lookup = {}
        for ind in indications:
            # The id column name is not certain, so try the known variants.
            prescribing_text_id = (
                ind.get('prescribing_text_id')
                or ind.get('indication_prescribing_txt_id')
                or ind.get('prescribing_txt_id')
            )
            if prescribing_text_id:
                indication_lookup[prescribing_text_id] = ind
        print(f"Number of items in indication_lookup: {len(indication_lookup)}")
        print("Sample of indication_lookup:")
        for key, value in list(indication_lookup.items())[:5]:
            print(f" {key}: {value}")

        item_prescribing_text_lookup = _group_values(
            item_prescribing_text_relationships,
            'pbs_code', 'prescribing_txt_id')

        print("\nDebugging restriction-prescribing-text relationships:")
        print("Full structure of first 5 relationships:")
        for relationship in restriction_prescribing_text_relationships[:5]:
            print(relationship)
        restriction_prescribing_text_lookup = _group_values(
            restriction_prescribing_text_relationships,
            'res_code', 'prescribing_text_id')
        print(f"Number of items in restriction_prescribing_text_lookup: {len(restriction_prescribing_text_lookup)}")
        print("Sample of restriction_prescribing_text_lookup:")
        for key, value in list(restriction_prescribing_text_lookup.items())[:5]:
            print(f" {key}: {value}")

        print("Debugging: Inspecting lookups")
        print(f"Number of items in prescribing_text_lookup: {len(prescribing_text_lookup)}")
        print(f"Number of items in restriction_lookup: {len(restriction_lookup)}")
        print(f"Number of items in indication_lookup: {len(indication_lookup)}")
        print(f"Number of items in item_prescribing_text_lookup: {len(item_prescribing_text_lookup)}")
        print(f"Number of items in restriction_prescribing_text_lookup: {len(restriction_prescribing_text_lookup)}")

        # One entry per PBS code; brands accumulate across item rows.
        data = {}
        for item in items:
            drug_name_lower = (item['drug_name'] or '').lower()
            if not any(biologic in drug_name_lower for biologic in BIOLOGICS):
                continue
            pbs_code = item['pbs_code']
            if pbs_code not in data:
                data[pbs_code] = {
                    "schedule_code": latest_schedule,
                    "schedule_year": schedule_year,
                    "schedule_month": schedule_month,
                    "name": item['drug_name'],
                    "brands": [],
                    "formulation": _classify_formulation(item['li_form']),
                    "li_form": item['li_form'],
                    "schedule_form": item['schedule_form'],
                    "manner_of_administration": item['manner_of_administration'],
                    "maximum_quantity": item['maximum_quantity_units'],
                    "number_of_repeats": item['number_of_repeats'],
                    "hospital_type": _classify_hospital_type(item['program_code']),
                    "restrictions": [],
                }
            brands = data[pbs_code]['brands']
            if item['brand_name'] not in brands:
                brands.append(item['brand_name'])

        # Single pass over the relationships; per-item restriction order is
        # the relationship order, as before.
        for relationship in item_restriction_relationships:
            entry = data.get(relationship.get('pbs_code'))
            if entry is None:
                continue
            res_code = relationship.get('res_code')
            restriction = restriction_lookup.get(res_code)
            if not restriction:
                continue
            for prescribing_text_id in restriction_prescribing_text_lookup.get(res_code, []):
                indication = indication_lookup.get(prescribing_text_id)
                if not indication:
                    continue
                condition = (indication.get('condition') or '').lower()
                found_indication = next(
                    (disease for disease in RHEUMATIC_DISEASES
                     if disease in condition),
                    None,
                )
                if found_indication:
                    entry['restrictions'].append(
                        self._build_restriction_entry(
                            res_code, restriction, found_indication))
                    break  # Stop after the first matching indication

        # Drop entries if restrictions are empty
        return {k: v for k, v in data.items() if v['restrictions']}

    def preprocess_data(self, data):
        """Flatten fetched data into one row per item/restriction/brand.

        Args:
            data: Mapping as returned by
                ``fetch_rheumatology_biologics_data``.

        Returns:
            ``{'combinations': [row, ...]}`` where each row is a flat dict.
        """
        combinations = []
        for pbs_code, item in data.items():
            for restriction in item['restrictions']:
                for brand in item['brands']:
                    combinations.append({
                        'pbs_code': pbs_code,
                        'drug': item['name'],
                        'brand': brand,
                        'formulation': item['li_form'],
                        'indication': restriction['indications'],
                        'treatment_phase': restriction['treatment_phase'],
                        'streamlined_code': restriction['streamlined_code'],
                        'online_application': restriction['online_application'],
                        'authority_method': restriction['authority_method'],
                        'hospital_type': item['hospital_type'],
                        'schedule_code': item['schedule_code'],
                        'schedule_year': item['schedule_year'],
                        'schedule_month': item['schedule_month'],
                    })
        return {'combinations': combinations}

    def save_data_to_hf(self, data, hf_token=None,
                        dataset_name="cmcmaster/rheumatology-biologics-dataset"):
        """Flatten ``data`` and push it to the Hugging Face Hub.

        Args:
            data: Mapping as returned by
                ``fetch_rheumatology_biologics_data``.
            hf_token: Hub token forwarded to ``push_to_hub``. ``None`` is
                forwarded unchanged; per the ``datasets`` documentation the
                library then falls back to locally stored credentials
                (confirm for the installed version).
            dataset_name: Target repository id on the Hub.
        """
        processed_data = self.preprocess_data(data)
        dataset = Dataset.from_list(processed_data['combinations'])
        dataset.push_to_hub(dataset_name, token=hf_token)
        print(f"Data saved to Hugging Face Hub: {dataset_name}")


def main():
    """Fetch the rheumatology dataset and publish it.

    Reads ``PBS_SUBSCRIPTION_KEY`` (falling back to the built-in key) and
    ``HF_TOKEN`` from the environment.

    Returns:
        0 on success, 1 if any step raised.
    """
    subscription_key = os.environ.get(
        "PBS_SUBSCRIPTION_KEY", DEFAULT_SUBSCRIPTION_KEY)
    hf_token = os.environ.get("HF_TOKEN")
    client = PBSPublicDataAPIClient(subscription_key, rate_limit=0.2)
    try:
        print("Fetching data on biologics used for rheumatological diseases...")
        data = client.fetch_rheumatology_biologics_data()
        print(f"Data fetched for {len(data)} items.")
        client.save_data_to_hf(data, hf_token)
        print("Data saved to Hugging Face Hub")
    except Exception as e:
        # Top-level boundary: report and signal failure via the exit code.
        print(f"An error occurred: {str(e)}")
        return 1
    return 0


if __name__ == "__main__":
    sys.exit(main())