Spaces:
Running
Running
Update pbs_data.py
Browse files- pbs_data.py +16 -90
pbs_data.py
CHANGED
@@ -1,13 +1,11 @@
|
|
|
|
1 |
import requests
|
2 |
import csv
|
3 |
from io import StringIO
|
4 |
-
import json
|
5 |
import time
|
6 |
from requests.adapters import HTTPAdapter
|
7 |
from requests.packages.urllib3.util.retry import Retry
|
8 |
-
import
|
9 |
-
import datetime
|
10 |
-
import os
|
11 |
|
12 |
class PBSPublicDataAPIClient:
|
13 |
def __init__(self, subscription_key, base_url='https://data-api.health.gov.au/pbs/api/v3', rate_limit=0.2):
|
@@ -389,30 +387,16 @@ class PBSPublicDataAPIClient:
|
|
389 |
|
390 |
def preprocess_data(self, data):
|
391 |
processed = {
|
392 |
-
'drugs': set(),
|
393 |
-
'brands': set(),
|
394 |
-
'formulations': set(),
|
395 |
-
'indications': set(),
|
396 |
-
'treatment_phases': set(),
|
397 |
-
'hospital_types': set(),
|
398 |
'combinations': []
|
399 |
}
|
400 |
|
401 |
for pbs_code, item in data.items():
|
402 |
-
processed['drugs'].add(item['name'])
|
403 |
-
processed['brands'].update(item['brands']) # Update this line
|
404 |
-
processed['formulations'].add(item['li_form'])
|
405 |
-
processed['hospital_types'].add(item['hospital_type'])
|
406 |
-
|
407 |
for restriction in item['restrictions']:
|
408 |
-
|
409 |
-
processed['treatment_phases'].add(restriction['treatment_phase'])
|
410 |
-
|
411 |
-
for brand in item['brands']: # Add this loop
|
412 |
processed['combinations'].append({
|
413 |
'pbs_code': pbs_code,
|
414 |
'drug': item['name'],
|
415 |
-
'brand': brand,
|
416 |
'formulation': item['li_form'],
|
417 |
'indication': restriction['indications'],
|
418 |
'treatment_phase': restriction['treatment_phase'],
|
@@ -425,76 +409,18 @@ class PBSPublicDataAPIClient:
|
|
425 |
'schedule_month': item['schedule_month']
|
426 |
})
|
427 |
|
428 |
-
return
|
429 |
|
430 |
-
def
|
431 |
processed_data = self.preprocess_data(data)
|
432 |
-
|
433 |
-
#
|
434 |
-
|
435 |
-
|
436 |
-
|
437 |
-
|
438 |
-
|
439 |
-
|
440 |
-
# Create tables
|
441 |
-
cursor.execute('''CREATE TABLE IF NOT EXISTS drugs
|
442 |
-
(id INTEGER PRIMARY KEY, name TEXT UNIQUE)''')
|
443 |
-
cursor.execute('''CREATE TABLE IF NOT EXISTS brands
|
444 |
-
(id INTEGER PRIMARY KEY, name TEXT UNIQUE)''')
|
445 |
-
cursor.execute('''CREATE TABLE IF NOT EXISTS formulations
|
446 |
-
(id INTEGER PRIMARY KEY, name TEXT UNIQUE)''')
|
447 |
-
cursor.execute('''CREATE TABLE IF NOT EXISTS indications
|
448 |
-
(id INTEGER PRIMARY KEY, name TEXT UNIQUE)''')
|
449 |
-
cursor.execute('''CREATE TABLE IF NOT EXISTS treatment_phases
|
450 |
-
(id INTEGER PRIMARY KEY, name TEXT UNIQUE)''')
|
451 |
-
cursor.execute('''CREATE TABLE IF NOT EXISTS hospital_types
|
452 |
-
(id INTEGER PRIMARY KEY, name TEXT UNIQUE)''')
|
453 |
-
cursor.execute('''CREATE TABLE IF NOT EXISTS combinations
|
454 |
-
(id INTEGER PRIMARY KEY, pbs_code TEXT, drug_id INTEGER, brand_id INTEGER,
|
455 |
-
formulation_id INTEGER, indication_id INTEGER, treatment_phase_id INTEGER,
|
456 |
-
streamlined_code TEXT, online_application BOOLEAN, authority_method TEXT,
|
457 |
-
hospital_type_id INTEGER, schedule_code TEXT, schedule_year INTEGER, schedule_month TEXT,
|
458 |
-
FOREIGN KEY (drug_id) REFERENCES drugs(id),
|
459 |
-
FOREIGN KEY (brand_id) REFERENCES brands(id),
|
460 |
-
FOREIGN KEY (formulation_id) REFERENCES formulations(id),
|
461 |
-
FOREIGN KEY (indication_id) REFERENCES indications(id),
|
462 |
-
FOREIGN KEY (treatment_phase_id) REFERENCES treatment_phases(id),
|
463 |
-
FOREIGN KEY (hospital_type_id) REFERENCES hospital_types(id))''')
|
464 |
-
|
465 |
-
# Insert data
|
466 |
-
for table in ['drugs', 'brands', 'formulations', 'indications', 'treatment_phases', 'hospital_types']:
|
467 |
-
cursor.executemany(f"INSERT OR IGNORE INTO {table} (name) VALUES (?)",
|
468 |
-
[(item,) for item in processed_data[table]])
|
469 |
-
|
470 |
-
# Insert combinations
|
471 |
-
for combo in processed_data['combinations']:
|
472 |
-
cursor.execute('''INSERT INTO combinations
|
473 |
-
(pbs_code, drug_id, brand_id, formulation_id, indication_id,
|
474 |
-
treatment_phase_id, streamlined_code, online_application, authority_method,
|
475 |
-
hospital_type_id, schedule_code, schedule_year, schedule_month)
|
476 |
-
VALUES (?,
|
477 |
-
(SELECT id FROM drugs WHERE name = ?),
|
478 |
-
(SELECT id FROM brands WHERE name = ?),
|
479 |
-
(SELECT id FROM formulations WHERE name = ?),
|
480 |
-
(SELECT id FROM indications WHERE name = ?),
|
481 |
-
(SELECT id FROM treatment_phases WHERE name = ?),
|
482 |
-
?, ?, ?,
|
483 |
-
(SELECT id FROM hospital_types WHERE name = ?),
|
484 |
-
?, ?, ?)''',
|
485 |
-
(combo['pbs_code'], combo['drug'], combo['brand'], combo['formulation'],
|
486 |
-
combo['indication'], combo['treatment_phase'], combo['streamlined_code'],
|
487 |
-
combo['online_application'], combo['authority_method'], combo['hospital_type'],
|
488 |
-
combo['schedule_code'], combo['schedule_year'], combo['schedule_month']))
|
489 |
-
|
490 |
-
# Add last_updated column and insert timestamp
|
491 |
-
cursor.execute('''CREATE TABLE IF NOT EXISTS metadata
|
492 |
-
(key TEXT PRIMARY KEY, value TEXT)''')
|
493 |
-
cursor.execute('''INSERT OR REPLACE INTO metadata (key, value)
|
494 |
-
VALUES ('last_updated', ?)''', (datetime.datetime.now().isoformat(),))
|
495 |
-
|
496 |
-
conn.commit()
|
497 |
-
conn.close()
|
498 |
|
499 |
def main():
|
500 |
client = PBSPublicDataAPIClient("2384af7c667342ceb5a736fe29f1dc6b", rate_limit=0.2)
|
@@ -505,8 +431,8 @@ def main():
|
|
505 |
|
506 |
print(f"Data fetched for {len(data)} items.")
|
507 |
|
508 |
-
client.
|
509 |
-
print("Data saved to
|
510 |
|
511 |
except Exception as e:
|
512 |
print(f"An error occurred: {str(e)}")
|
|
|
1 |
+
import datetime
|
2 |
import requests
|
3 |
import csv
|
4 |
from io import StringIO
|
|
|
5 |
import time
|
6 |
from requests.adapters import HTTPAdapter
|
7 |
from requests.packages.urllib3.util.retry import Retry
|
8 |
+
from datasets import Dataset
|
|
|
|
|
9 |
|
10 |
class PBSPublicDataAPIClient:
|
11 |
def __init__(self, subscription_key, base_url='https://data-api.health.gov.au/pbs/api/v3', rate_limit=0.2):
|
|
|
387 |
|
388 |
def preprocess_data(self, data):
|
389 |
processed = {
|
|
|
|
|
|
|
|
|
|
|
|
|
390 |
'combinations': []
|
391 |
}
|
392 |
|
393 |
for pbs_code, item in data.items():
|
|
|
|
|
|
|
|
|
|
|
394 |
for restriction in item['restrictions']:
|
395 |
+
for brand in item['brands']:
|
|
|
|
|
|
|
396 |
processed['combinations'].append({
|
397 |
'pbs_code': pbs_code,
|
398 |
'drug': item['name'],
|
399 |
+
'brand': brand,
|
400 |
'formulation': item['li_form'],
|
401 |
'indication': restriction['indications'],
|
402 |
'treatment_phase': restriction['treatment_phase'],
|
|
|
409 |
'schedule_month': item['schedule_month']
|
410 |
})
|
411 |
|
412 |
+
return processed
|
413 |
|
414 |
+
def save_data_to_hf(self, data, dataset_name="cmcmaster/rheumatology-biologics-dataset"):
    """Flatten ``data`` and publish the result as a Hugging Face Hub dataset.

    ``data`` is handed to ``self.preprocess_data``; the list stored under
    its ``'combinations'`` key becomes the rows of the dataset (one row
    per dict).

    Args:
        data: Raw fetched items, in whatever shape ``preprocess_data``
            accepts.
        dataset_name: Hub repository id to push to.

    Raises:
        ValueError: If preprocessing produced no combinations, so there
            is nothing meaningful to publish.
    """
    processed_data = self.preprocess_data(data)
    combinations = processed_data['combinations']

    # Refuse to publish an empty dataset: an empty fetch most likely means
    # an upstream failure, and pushing it would presumably replace the
    # previously published rows with nothing -- TODO confirm Hub behaviour.
    if not combinations:
        raise ValueError(
            f"No combinations to publish; refusing to push an empty dataset to {dataset_name}"
        )

    # Create a Dataset from the combinations
    dataset = Dataset.from_list(combinations)

    # Push the dataset to the Hugging Face Hub
    # NOTE(review): presumably needs Hub credentials configured in the
    # environment -- verify against the deployment setup.
    dataset.push_to_hub(dataset_name)

    print(f"Data saved to Hugging Face Hub: {dataset_name}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
424 |
|
425 |
def main():
|
426 |
client = PBSPublicDataAPIClient("2384af7c667342ceb5a736fe29f1dc6b", rate_limit=0.2)
|
|
|
431 |
|
432 |
print(f"Data fetched for {len(data)} items.")
|
433 |
|
434 |
+
client.save_data_to_hf(data)
|
435 |
+
print("Data saved to Hugging Face Hub")
|
436 |
|
437 |
except Exception as e:
|
438 |
print(f"An error occurred: {str(e)}")
|