Spaces:

cmcmaster
/

pbs_biologics_helper

Running

File size: 20,066 Bytes

import datetime
import requests
import csv
from io import StringIO
import time
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
from datasets import Dataset

class PBSPublicDataAPIClient:
    def __init__(self, subscription_key, base_url='https://data-api.health.gov.au/pbs/api/v3', rate_limit=0.2):
        """Store the API settings and build an HTTP session with a retry policy.

        Args:
            subscription_key: Key sent with every request made by this client.
            base_url: Root URL that endpoint names are appended to.
            rate_limit: Maximum number of requests per second.
        """
        # Retry policy handled by the transport adapter for HTTPS URLs.
        retry_policy = Retry(
            total=5,
            backoff_factor=1,
            status_forcelist=[429, 500, 502, 503, 504],
        )
        https_adapter = HTTPAdapter(max_retries=retry_policy)
        session = requests.Session()
        session.mount('https://', https_adapter)

        self.subscription_key = subscription_key
        self.base_url = base_url
        self.rate_limit = rate_limit  # Requests per second
        self.last_request_time = 0  # Timestamp of the most recent request
        self.session = session

    def get_sample_data(self, endpoint, limit=5):
        params = {"limit": limit}
        response = self.make_request(endpoint, params=params, accept="text/csv")
        csv_content = StringIO(response.text)
        return list(csv.DictReader(csv_content))

    def fetch_sample_data(self):
        schedules = self.get_schedules()
        latest_schedule = schedules[0]['schedule_code']

        endpoints = [
            "amt-items",
            "atc-codes",
            "indications",
            "prescribing-texts",
            "item-prescribing-text-relationships",
            "restrictions",
            "item-restriction-relationships"
        ]

        sample_data = {}
        for endpoint in endpoints:
            print(f"Fetching sample data from /{endpoint}...")
            data = self.get_sample_data(endpoint)
            if data:
                sample_data[endpoint] = data
                print(f"Sample keys for {endpoint}: {data[0].keys()}")
            else:
                print(f"No data found for {endpoint}")
            time.sleep(2)  # Wait 2 seconds between requests to avoid rate limiting

        return sample_data

    def get_raw_data(self, endpoint, params=None, accept="application/json"):
        response = self.make_request(endpoint, params=params, accept=accept)
        return response.text

    def make_request(self, endpoint, params=None, accept="application/json"):
        url = f"{self.base_url}/{endpoint}"
        headers = {
            "subscription-key": self.subscription_key,
            "Accept": accept
        }

        while True:
            current_time = time.time()
            time_since_last_request = current_time - self.last_request_time
            if time_since_last_request < 1 / self.rate_limit:
                sleep_time = (1 / self.rate_limit) - time_since_last_request
                time.sleep(sleep_time)

            try:
                response = self.session.get(url, headers=headers, params=params)
                self.last_request_time = time.time()

                if response.status_code == 429:
                    retry_after = int(response.headers.get('Retry-After', 60))
                    print(f"Rate limit exceeded. Waiting for {retry_after} seconds.")
                    time.sleep(retry_after)
                    continue

                response.raise_for_status()
                return response

            except requests.exceptions.RequestException as e:
                print(f"Request failed: {str(e)}. Retrying in 5 seconds...")
                time.sleep(5)

    def get_schedules(self, limit=100):
        endpoint = "schedules"
        params = {"limit": limit}
        response = self.make_request(endpoint, params=params)
        json_data = response.json()
        return json_data['data']

    def get_amt_items(self, schedule_code, limit=100000):
        endpoint = "amt-items"
        params = {
            "schedule_code": schedule_code,
            "limit": limit
        }
        response = self.make_request(endpoint, params=params, accept="text/csv")
        csv_content = StringIO(response.text)
        return list(csv.DictReader(csv_content))

    def get_atc_codes(self, schedule_code, limit=100000):
        endpoint = "atc-codes"
        params = {
            "schedule_code": schedule_code,
            "limit": limit
        }
        response = self.make_request(endpoint, params=params, accept="text/csv")
        csv_content = StringIO(response.text)
        return list(csv.DictReader(csv_content))

    def get_indications(self, schedule_code, limit=100000):
        endpoint = "indications"
        params = {
            "schedule_code": schedule_code,
            "limit": limit
        }
        response = self.make_request(endpoint, params=params, accept="text/csv")
        csv_content = StringIO(response.text)
        return list(csv.DictReader(csv_content))

    def get_prescribing_texts(self, schedule_code, limit=100000):
        endpoint = "prescribing-texts"
        params = {
            "schedule_code": schedule_code,
            "limit": limit
        }
        response = self.make_request(endpoint, params=params, accept="text/csv")
        csv_content = StringIO(response.text)
        return list(csv.DictReader(csv_content))

    def get_item_prescribing_text_relationships(self, schedule_code, limit=100000):
        endpoint = "item-prescribing-text-relationships"
        params = {
            "schedule_code": schedule_code,
            "limit": limit
        }
        response = self.make_request(endpoint, params=params, accept="text/csv")
        csv_content = StringIO(response.text)
        return list(csv.DictReader(csv_content))

    def get_restrictions(self, schedule_code, limit=100000):
        endpoint = "restrictions"
        params = {
            "schedule_code": schedule_code,
            "limit": limit
        }
        response = self.make_request(endpoint, params=params, accept="text/csv")
        csv_content = StringIO(response.text)
        return list(csv.DictReader(csv_content))

    def get_item_restriction_relationships(self, schedule_code, limit=100000):
        endpoint = "item-restriction-relationships"
        params = {
            "schedule_code": schedule_code,
            "limit": limit
        }
        response = self.make_request(endpoint, params=params, accept="text/csv")
        csv_content = StringIO(response.text)
        return list(csv.DictReader(csv_content))

    def get_restriction_prescribing_text_relationships(self, schedule_code, limit=100000):
        endpoint = "restriction-prescribing-text-relationships"
        params = {
            "schedule_code": schedule_code,
            "limit": limit
        }
        response = self.make_request(endpoint, params=params, accept="text/csv")
        csv_content = StringIO(response.text)
        return list(csv.DictReader(csv_content))

    def get_items(self, schedule_code, limit=100000):
        endpoint = "items"
        params = {
            "schedule_code": schedule_code,
            "limit": limit
        }
        response = self.make_request(endpoint, params=params, accept="text/csv")
        csv_content = StringIO(response.text)
        return list(csv.DictReader(csv_content))

    def fetch_rheumatology_biologics_data(self):
        biologics = [
            "adalimumab", "etanercept", "infliximab", "certolizumab", "golimumab",
            "rituximab", "abatacept", "tocilizumab", "secukinumab", "ixekizumab",
            "ustekinumab", "guselkumab", "tofacitinib", "baricitinib", "secukinumab",
            "upadacitinib", "anifrolumab"
        ]

        rheumatic_diseases = [
            "rheumatoid arthritis", "psoriatic arthritis", "ankylosing spondylitis",
            "non-radiographic axial spondyloarthritis", "giant cell arteritis", 
            "juvenile idiopathic arthritis", "systemic lupus erythematosus"
        ]

        data = {}
        schedules = self.get_schedules()

        # Select schedule based on current month
        current_date = datetime.datetime.now()
        current_schedule = next(
            (s for s in schedules if s['effective_year'] == current_date.year and s['effective_month'] == current_date.strftime('%B').upper()),
            schedules[0]  # fallback to the most recent schedule if no match
        )
        latest_schedule = current_schedule['schedule_code']
        schedule_year = current_schedule['effective_year']
        schedule_month = current_schedule['effective_month']

        print(f"Selected schedule: {latest_schedule} (Effective: {current_schedule['effective_date']})")

        print("Fetching items...")
        items = self.get_items(latest_schedule)
        time.sleep(5)

        print("Fetching indications...")
        indications = self.get_indications(latest_schedule)
        print(f"Number of indications fetched: {len(indications)}")
        print("Sample of raw indications data:")
        for indication in indications[:5]:
            print(indication)
        time.sleep(5)

        print("Fetching prescribing texts...")
        prescribing_texts = self.get_prescribing_texts(latest_schedule)
        time.sleep(5)

        print("Fetching item-prescribing-text relationships...")
        item_prescribing_text_relationships = self.get_item_prescribing_text_relationships(latest_schedule)
        time.sleep(5)

        print("Fetching restrictions...")
        restrictions = self.get_restrictions(latest_schedule)
        time.sleep(5)

        print("Fetching item-restriction relationships...")
        item_restriction_relationships = self.get_item_restriction_relationships(latest_schedule)

        print("Fetching restriction-prescribing-text relationships...")
        restriction_prescribing_text_relationships = self.get_restriction_prescribing_text_relationships(latest_schedule)
        print(f"Number of restriction-prescribing-text relationships fetched: {len(restriction_prescribing_text_relationships)}")
        time.sleep(5)

        # Create lookup dictionaries
        prescribing_text_lookup = {text['prescribing_txt_id']: text for text in prescribing_texts if 'prescribing_txt_id' in text}
        restriction_lookup = {res['res_code']: res for res in restrictions if 'res_code' in res}

        # Create indication lookup
        indication_lookup = {}
        for ind in indications:
            # Print all keys in the first indication to see available fields
            if not indication_lookup:
                print("Keys in indication data:", ind.keys())
            
            # Try different possible keys for the prescribing text ID
            prescribing_text_id = ind.get('prescribing_text_id') or ind.get('indication_prescribing_txt_id') or ind.get('prescribing_txt_id')
            if prescribing_text_id:
                indication_lookup[prescribing_text_id] = ind

        print(f"Number of items in indication_lookup: {len(indication_lookup)}")
        print("Sample of indication_lookup:")
        for key, value in list(indication_lookup.items())[:5]:
            print(f"  {key}: {value}")

        # Create a lookup for item-prescribing-text relationships
        item_prescribing_text_lookup = {}
        for relationship in item_prescribing_text_relationships:
            pbs_code = relationship.get('pbs_code')
            prescribing_txt_id = relationship.get('prescribing_txt_id')
            if pbs_code and prescribing_txt_id:
                if pbs_code not in item_prescribing_text_lookup:
                    item_prescribing_text_lookup[pbs_code] = []
                item_prescribing_text_lookup[pbs_code].append(prescribing_txt_id)

        # Create a lookup for restriction-prescribing-text relationships
        restriction_prescribing_text_lookup = {}
        print("\nDebugging restriction-prescribing-text relationships:")
        print("Full structure of first 5 relationships:")
        for relationship in restriction_prescribing_text_relationships[:5]:
            print(relationship)

        for relationship in restriction_prescribing_text_relationships:
            res_code = relationship.get('res_code')
            prescribing_text_id = relationship.get('prescribing_text_id')
            if res_code and prescribing_text_id:
                if res_code not in restriction_prescribing_text_lookup:
                    restriction_prescribing_text_lookup[res_code] = []
                restriction_prescribing_text_lookup[res_code].append(prescribing_text_id)

        print(f"Number of items in restriction_prescribing_text_lookup: {len(restriction_prescribing_text_lookup)}")
        print("Sample of restriction_prescribing_text_lookup:")
        for key, value in list(restriction_prescribing_text_lookup.items())[:5]:
            print(f"  {key}: {value}")

        print("Debugging: Inspecting lookups")
        print(f"Number of items in prescribing_text_lookup: {len(prescribing_text_lookup)}")
        print(f"Number of items in restriction_lookup: {len(restriction_lookup)}")
        print(f"Number of items in indication_lookup: {len(indication_lookup)}")
        print(f"Number of items in item_prescribing_text_lookup: {len(item_prescribing_text_lookup)}")
        print(f"Number of items in restriction_prescribing_text_lookup: {len(restriction_prescribing_text_lookup)}")

        def classify_formulation(description):
            # Define keywords for each formulation type
            tablet_keywords = ['Tablet']
            pen_keywords = ['pen', 'auto-injector', 'autoinjector']
            syringe_keywords = ['syringe']
            infusion_keywords = ['I.V. infusion', 'Concentrate for injection']

            # Normalize the description to lowercase for case-insensitive matching
            desc_lower = description.lower()

            # Check for keywords and return the corresponding formulation type
            if any(keyword.lower() in desc_lower for keyword in tablet_keywords):
                return 'tablet'
            elif any(keyword.lower() in desc_lower for keyword in pen_keywords):
                return 'subcut pen'
            elif any(keyword.lower() in desc_lower for keyword in syringe_keywords):
                return 'subcut syringe'
            elif any(keyword.lower() in desc_lower for keyword in infusion_keywords):
                return 'infusion'
            else:
                return 'unknown'  # For cases that don't match any category

        def classify_hospital_type(program_code):
            if program_code == 'HS':
                return 'Private'
            elif program_code == 'HB':
                return 'Public'
            else:
                return 'Any'

        for item in items:
            if any(biologic.lower() in item['drug_name'].lower() for biologic in biologics):
                pbs_code = item['pbs_code']
                if pbs_code not in data:
                    data[pbs_code] = {
                        "schedule_code": latest_schedule,
                        "schedule_year": schedule_year,
                        "schedule_month": schedule_month,
                        "name": item['drug_name'],
                        "brands": [],  # Change this to a list
                        "formulation": classify_formulation(item['li_form']),
                        "li_form": item['li_form'],
                        "schedule_form": item['schedule_form'],
                        "manner_of_administration": item['manner_of_administration'],
                        "maximum_quantity": item['maximum_quantity_units'],
                        "number_of_repeats": item['number_of_repeats'],
                        "hospital_type": classify_hospital_type(item['program_code']),
                        "restrictions": []
                    }
                # Append the brand name if it's not already in the list
                if item['brand_name'] not in data[pbs_code]['brands']:
                    data[pbs_code]['brands'].append(item['brand_name'])

        for pbs_code in list(data.keys()):
            for relationship in item_restriction_relationships:
                if relationship.get('pbs_code') == pbs_code:
                    res_code = relationship.get('res_code')
                    restriction = restriction_lookup.get(res_code)
                    if restriction:
                        prescribing_text_ids = restriction_prescribing_text_lookup.get(res_code, [])
                        for prescribing_text_id in prescribing_text_ids:
                            indication = indication_lookup.get(prescribing_text_id)
                            if indication:
                                condition = indication.get('condition', '').lower()
                                found_indication = next((disease for disease in rheumatic_diseases if disease.lower() in condition), None)
                                if found_indication:
                                    restriction_data = {
                                        'res_code': res_code,
                                        'indications': found_indication,
                                        'treatment_phase': restriction.get('treatment_phase', ''),
                                        'restriction_text': restriction.get('li_html_text', ''),
                                        'authority_method': restriction.get('authority_method', ''),
                                        'streamlined_code': restriction.get('treatment_of_code') if restriction.get('authority_method') == "STREAMLINED" else None,
                                        'online_application': "HOBART TAS 7001" not in restriction.get('schedule_html_text', '')
                                    }
                                    data[pbs_code]['restrictions'].append(restriction_data)
                                    break  # Stop after finding the first matching indication

        # Drop entries if restrictions are empty
        data = {k: v for k, v in data.items() if v['restrictions']}
        return data

    def preprocess_data(self, data):
        processed = {
            'combinations': []
        }
        
        for pbs_code, item in data.items():
            for restriction in item['restrictions']:
                for brand in item['brands']:
                    processed['combinations'].append({
                        'pbs_code': pbs_code,
                        'drug': item['name'],
                        'brand': brand,
                        'formulation': item['li_form'],
                        'indication': restriction['indications'],
                        'treatment_phase': restriction['treatment_phase'],
                        'streamlined_code': restriction['streamlined_code'],
                        'online_application': restriction['online_application'],
                        'authority_method': restriction['authority_method'],
                        'hospital_type': item['hospital_type'],
                        'schedule_code': item['schedule_code'],
                        'schedule_year': item['schedule_year'],
                        'schedule_month': item['schedule_month']
                    })
        
        return processed

    def save_data_to_hf(self, data, hf_token, dataset_name="cmcmaster/rheumatology-biologics-dataset"):
        """Flatten ``data`` and publish it as a dataset on the Hugging Face Hub.

        Args:
            data: Per-item mapping in the shape accepted by ``preprocess_data``.
            hf_token: Token passed to ``push_to_hub``.
            dataset_name: Hub repository the dataset is pushed to.
        """
        # One dataset row per combination produced by preprocess_data.
        rows = self.preprocess_data(data)['combinations']
        Dataset.from_list(rows).push_to_hub(dataset_name, token=hf_token)

        print(f"Data saved to Hugging Face Hub: {dataset_name}")

def main():
    """Fetch the rheumatology biologics data and publish it to the Hugging Face Hub.

    The PBS subscription key is read from ``PBS_SUBSCRIPTION_KEY`` (falling back
    to the key previously embedded here) and the Hub token from ``HF_TOKEN``.
    The upload call was previously made without its required ``hf_token``
    argument, so it failed with a TypeError after the data had been fetched.
    Any error is reported on stdout rather than raised.
    """
    import os  # local import: only the script entry point needs the environment

    subscription_key = os.environ.get("PBS_SUBSCRIPTION_KEY", "2384af7c667342ceb5a736fe29f1dc6b")
    # NOTE(review): when HF_TOKEN is unset this passes None; confirm the Hub
    # client then falls back to locally stored credentials as expected.
    hf_token = os.environ.get("HF_TOKEN")

    client = PBSPublicDataAPIClient(subscription_key, rate_limit=0.2)

    try:
        print("Fetching data on biologics used for rheumatological diseases...")
        data = client.fetch_rheumatology_biologics_data()

        print(f"Data fetched for {len(data)} items.")

        client.save_data_to_hf(data, hf_token)
        print("Data saved to Hugging Face Hub")

    except Exception as e:
        print(f"An error occurred: {str(e)}")

# Run the fetch-and-publish pipeline only when this file is executed as a script.
if __name__ == "__main__":
    main()