cmcmaster committed on
Commit
54de438
1 Parent(s): 13ba67a

Update pbs_data.py

Browse files
Files changed (1) hide show
  1. pbs_data.py +16 -90
pbs_data.py CHANGED
@@ -1,13 +1,11 @@
 
1
  import requests
2
  import csv
3
  from io import StringIO
4
- import json
5
  import time
6
  from requests.adapters import HTTPAdapter
7
  from requests.packages.urllib3.util.retry import Retry
8
- import sqlite3
9
- import datetime
10
- import os
11
 
12
  class PBSPublicDataAPIClient:
13
  def __init__(self, subscription_key, base_url='https://data-api.health.gov.au/pbs/api/v3', rate_limit=0.2):
@@ -389,30 +387,16 @@ class PBSPublicDataAPIClient:
389
 
390
  def preprocess_data(self, data):
391
  processed = {
392
- 'drugs': set(),
393
- 'brands': set(),
394
- 'formulations': set(),
395
- 'indications': set(),
396
- 'treatment_phases': set(),
397
- 'hospital_types': set(),
398
  'combinations': []
399
  }
400
 
401
  for pbs_code, item in data.items():
402
- processed['drugs'].add(item['name'])
403
- processed['brands'].update(item['brands']) # Update this line
404
- processed['formulations'].add(item['li_form'])
405
- processed['hospital_types'].add(item['hospital_type'])
406
-
407
  for restriction in item['restrictions']:
408
- processed['indications'].add(restriction['indications'])
409
- processed['treatment_phases'].add(restriction['treatment_phase'])
410
-
411
- for brand in item['brands']: # Add this loop
412
  processed['combinations'].append({
413
  'pbs_code': pbs_code,
414
  'drug': item['name'],
415
- 'brand': brand, # Update this line
416
  'formulation': item['li_form'],
417
  'indication': restriction['indications'],
418
  'treatment_phase': restriction['treatment_phase'],
@@ -425,76 +409,18 @@ class PBSPublicDataAPIClient:
425
  'schedule_month': item['schedule_month']
426
  })
427
 
428
- return {k: sorted(v) if isinstance(v, set) else v for k, v in processed.items()}
429
 
430
- def save_data_to_sqlite(self, data, db_path="rheumatology_biologics_data.db"):
431
  processed_data = self.preprocess_data(data)
432
-
433
- # Remove the existing database file if it exists
434
- if os.path.exists(db_path):
435
- os.remove(db_path)
436
-
437
- conn = sqlite3.connect(db_path)
438
- cursor = conn.cursor()
439
-
440
- # Create tables
441
- cursor.execute('''CREATE TABLE IF NOT EXISTS drugs
442
- (id INTEGER PRIMARY KEY, name TEXT UNIQUE)''')
443
- cursor.execute('''CREATE TABLE IF NOT EXISTS brands
444
- (id INTEGER PRIMARY KEY, name TEXT UNIQUE)''')
445
- cursor.execute('''CREATE TABLE IF NOT EXISTS formulations
446
- (id INTEGER PRIMARY KEY, name TEXT UNIQUE)''')
447
- cursor.execute('''CREATE TABLE IF NOT EXISTS indications
448
- (id INTEGER PRIMARY KEY, name TEXT UNIQUE)''')
449
- cursor.execute('''CREATE TABLE IF NOT EXISTS treatment_phases
450
- (id INTEGER PRIMARY KEY, name TEXT UNIQUE)''')
451
- cursor.execute('''CREATE TABLE IF NOT EXISTS hospital_types
452
- (id INTEGER PRIMARY KEY, name TEXT UNIQUE)''')
453
- cursor.execute('''CREATE TABLE IF NOT EXISTS combinations
454
- (id INTEGER PRIMARY KEY, pbs_code TEXT, drug_id INTEGER, brand_id INTEGER,
455
- formulation_id INTEGER, indication_id INTEGER, treatment_phase_id INTEGER,
456
- streamlined_code TEXT, online_application BOOLEAN, authority_method TEXT,
457
- hospital_type_id INTEGER, schedule_code TEXT, schedule_year INTEGER, schedule_month TEXT,
458
- FOREIGN KEY (drug_id) REFERENCES drugs(id),
459
- FOREIGN KEY (brand_id) REFERENCES brands(id),
460
- FOREIGN KEY (formulation_id) REFERENCES formulations(id),
461
- FOREIGN KEY (indication_id) REFERENCES indications(id),
462
- FOREIGN KEY (treatment_phase_id) REFERENCES treatment_phases(id),
463
- FOREIGN KEY (hospital_type_id) REFERENCES hospital_types(id))''')
464
-
465
- # Insert data
466
- for table in ['drugs', 'brands', 'formulations', 'indications', 'treatment_phases', 'hospital_types']:
467
- cursor.executemany(f"INSERT OR IGNORE INTO {table} (name) VALUES (?)",
468
- [(item,) for item in processed_data[table]])
469
-
470
- # Insert combinations
471
- for combo in processed_data['combinations']:
472
- cursor.execute('''INSERT INTO combinations
473
- (pbs_code, drug_id, brand_id, formulation_id, indication_id,
474
- treatment_phase_id, streamlined_code, online_application, authority_method,
475
- hospital_type_id, schedule_code, schedule_year, schedule_month)
476
- VALUES (?,
477
- (SELECT id FROM drugs WHERE name = ?),
478
- (SELECT id FROM brands WHERE name = ?),
479
- (SELECT id FROM formulations WHERE name = ?),
480
- (SELECT id FROM indications WHERE name = ?),
481
- (SELECT id FROM treatment_phases WHERE name = ?),
482
- ?, ?, ?,
483
- (SELECT id FROM hospital_types WHERE name = ?),
484
- ?, ?, ?)''',
485
- (combo['pbs_code'], combo['drug'], combo['brand'], combo['formulation'],
486
- combo['indication'], combo['treatment_phase'], combo['streamlined_code'],
487
- combo['online_application'], combo['authority_method'], combo['hospital_type'],
488
- combo['schedule_code'], combo['schedule_year'], combo['schedule_month']))
489
-
490
- # Add last_updated column and insert timestamp
491
- cursor.execute('''CREATE TABLE IF NOT EXISTS metadata
492
- (key TEXT PRIMARY KEY, value TEXT)''')
493
- cursor.execute('''INSERT OR REPLACE INTO metadata (key, value)
494
- VALUES ('last_updated', ?)''', (datetime.datetime.now().isoformat(),))
495
-
496
- conn.commit()
497
- conn.close()
498
 
499
  def main():
500
  client = PBSPublicDataAPIClient("2384af7c667342ceb5a736fe29f1dc6b", rate_limit=0.2)
@@ -505,8 +431,8 @@ def main():
505
 
506
  print(f"Data fetched for {len(data)} items.")
507
 
508
- client.save_data_to_sqlite(data)
509
- print("Data saved to rheumatology_biologics_data.db")
510
 
511
  except Exception as e:
512
  print(f"An error occurred: {str(e)}")
 
1
+ import datetime
2
  import requests
3
  import csv
4
  from io import StringIO
 
5
  import time
6
  from requests.adapters import HTTPAdapter
7
  from requests.packages.urllib3.util.retry import Retry
8
+ from datasets import Dataset
 
 
9
 
10
  class PBSPublicDataAPIClient:
11
  def __init__(self, subscription_key, base_url='https://data-api.health.gov.au/pbs/api/v3', rate_limit=0.2):
 
387
 
388
  def preprocess_data(self, data):
389
  processed = {
 
 
 
 
 
 
390
  'combinations': []
391
  }
392
 
393
  for pbs_code, item in data.items():
 
 
 
 
 
394
  for restriction in item['restrictions']:
395
+ for brand in item['brands']:
 
 
 
396
  processed['combinations'].append({
397
  'pbs_code': pbs_code,
398
  'drug': item['name'],
399
+ 'brand': brand,
400
  'formulation': item['li_form'],
401
  'indication': restriction['indications'],
402
  'treatment_phase': restriction['treatment_phase'],
 
409
  'schedule_month': item['schedule_month']
410
  })
411
 
412
+ return processed
413
 
414
+ def save_data_to_hf(self, data, dataset_name="cmcmaster/rheumatology-biologics-dataset"):
415
  processed_data = self.preprocess_data(data)
416
+
417
+ # Create a Dataset from the combinations
418
+ dataset = Dataset.from_list(processed_data['combinations'])
419
+
420
+ # Push the dataset to the Hugging Face Hub
421
+ dataset.push_to_hub(dataset_name)
422
+
423
+ print(f"Data saved to Hugging Face Hub: {dataset_name}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
424
 
425
  def main():
426
  client = PBSPublicDataAPIClient("2384af7c667342ceb5a736fe29f1dc6b", rate_limit=0.2)
 
431
 
432
  print(f"Data fetched for {len(data)} items.")
433
 
434
+ client.save_data_to_hf(data)
435
+ print("Data saved to Hugging Face Hub")
436
 
437
  except Exception as e:
438
  print(f"An error occurred: {str(e)}")