Daryl Fung committed on
Commit
2a000a7
1 Parent(s): c082b57

added top 10

Browse files
DAI scraper/scrap_assessment.py ADDED
@@ -0,0 +1,108 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""Scrape dataset assessment tables from the HDRN inventory site.

For each jurisdiction's inventory page, this script walks every dataset
page (via the "Next" button), pulls the rationale and discussion text out
of the first table row, and writes one CSV file per jurisdiction.
"""
import csv
import os

from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import Select, WebDriverWait

# Inventory label page for each jurisdiction / data holder.
jurisdictions = {
    'mchp': "https://www.hdrn.ca/en/inventory/label/42/4826",
    'bc': 'https://www.hdrn.ca/en/inventory/label/46/4672/',
    'ab': "https://www.hdrn.ca/en/inventory/label/44/4684/",
    'sk': "https://www.hdrn.ca/en/inventory/label/51/4378/",
    'ices': "https://www.hdrn.ca/en/inventory/label/43/4436/",
    'nb': "https://www.hdrn.ca/en/inventory/label/47/4611/",
    'hdns': "https://www.hdrn.ca/en/inventory/label/49/4411/",
    'nlchi': "https://www.hdrn.ca/en/inventory/label/50/4350/",
    'cihi': "https://www.hdrn.ca/en/inventory/label/45/4744/",
}

# Index of the first jurisdiction to process; the original hard-coded
# slice [2:] presumably resumed a partially-completed run — TODO confirm.
START_INDEX = 2

# SECURITY: credentials were hard-coded in source control in the original
# script. Read them from the environment; the fallbacks keep the script
# runnable but the exposed password should be rotated and the fallbacks
# removed. TODO(review): rotate this credential.
USERNAME = os.environ.get("HDRN_USERNAME", "dfung")
PASSWORD = os.environ.get("HDRN_PASSWORD", "Daryl_1212hdrnhdrn")


def _login(driver):
    """Fill and submit the login form shown when the session is unauthenticated."""
    username_input = driver.find_element(By.NAME, 'username')
    password_input = driver.find_element(By.NAME, 'password')
    username_input.send_keys(USERNAME)
    password_input.send_keys(PASSWORD)
    # Submitting via RETURN in the password field posts the form.
    password_input.send_keys(Keys.RETURN)


def _scrape_jurisdiction(driver, url):
    """Return a list of {dataset, rationale, discussion} dicts for one jurisdiction.

    Pages through the inventory with the "Next" button until none remains.
    If the assessment table is absent (we were bounced to the login page),
    logs in and retries the same page.
    """
    assessments = []
    driver.get(url)
    while True:
        try:
            # Wait briefly for the assessment table to appear; a timeout
            # means we are on the login page instead.
            WebDriverWait(driver, 2).until(
                EC.presence_of_element_located((By.CLASS_NAME, "table")))

            dataset = Select(
                driver.find_element(By.ID, "selected_dataset")
            ).first_selected_option.text

            table = driver.find_element(By.CLASS_NAME, "table")
            tbody = table.find_element(By.TAG_NAME, "tbody")
            first_tr = tbody.find_element(By.TAG_NAME, "tr")
            # 8 labels when a discussion is present, 6 otherwise.
            labels = first_tr.find_elements(By.TAG_NAME, "label")

            rationale = ""
            discussion = ""
            if len(labels) == 6:
                rationale = labels[3].text
            elif len(labels) == 8:
                rationale = labels[3].text
                discussion = labels[5].text

            assessments.append({
                'dataset': dataset,
                'rationale': rationale,
                'discussion': discussion,
            })

            next_button = driver.find_elements(By.XPATH, "//*[contains(text(), 'Next')]")
            if not next_button:
                break  # last page of this jurisdiction
            next_button[0].click()
        except (TimeoutException, NoSuchElementException):
            # The original used a bare `except:` here, which swallowed every
            # error (including bugs) and could loop forever; only the two
            # "not logged in" symptoms are retried now.
            _login(driver)
    return assessments


def _write_csv(rows, path):
    """Write the scraped rows to *path*; no-op when there is nothing to write."""
    if not rows:
        return  # rows[0] below would raise IndexError on an empty result
    with open(path, mode='w', newline='') as file:
        writer = csv.DictWriter(file, fieldnames=list(rows[0].keys()))
        writer.writeheader()
        writer.writerows(rows)


def main():
    # Set up the Selenium driver (ensure the Chrome webdriver is installed).
    driver = webdriver.Chrome()
    try:
        for jurisdiction_name, url in list(jurisdictions.items())[START_INDEX:]:
            # BUG FIX: the original accumulated results in one list across
            # the whole loop, so each jurisdiction's CSV also contained all
            # previous jurisdictions' rows; results are now per-jurisdiction.
            rows = _scrape_jurisdiction(driver, url)
            _write_csv(rows, f'{jurisdiction_name}_assessment.csv')
    finally:
        driver.quit()  # always release the browser process, even on error


if __name__ == '__main__':
    main()
keyphrase_extraction.py CHANGED
@@ -3,8 +3,18 @@ import spacy
3
  from spacy import displacy
4
  import pandas as pd
5
  import seaborn as sns
 
6
  import matplotlib.pyplot as plt
7
  from pathlib import Path
 
 
 
 
 
 
 
 
 
8
  import pytextrank
9
 
10
  # Load the pre-trained NLP model
@@ -42,6 +52,39 @@ def get_top_key_phrases(text, top_n, save_output):
42
  plt.savefig(save_output, dpi=300, bbox_inches="tight")
43
  plt.close()
44
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
45
 
46
  def display_key_phrases(text, save_output):
47
  text = text.replace('\n', ' \n')
@@ -69,5 +112,6 @@ def display_key_phrases(text, save_output):
69
 
70
 
71
  if __name__ == '__main__':
72
- get_top_key_phrases(text, 10)
73
- display_key_phrases(text)
 
 
3
  from spacy import displacy
4
  import pandas as pd
5
  import seaborn as sns
6
+ import textrank
7
  import matplotlib.pyplot as plt
8
  from pathlib import Path
9
+ from nltk.tokenize import sent_tokenize, word_tokenize
10
+ from nltk.corpus import stopwords
11
+ from nltk.probability import FreqDist
12
+ from nltk.tokenize import word_tokenize
13
+ from nltk.stem import PorterStemmer
14
+ from sklearn.metrics.pairwise import cosine_similarity
15
+ import networkx as nx
16
+ import matplotlib.pyplot as plt
17
+ import numpy as np
18
  import pytextrank
19
 
20
  # Load the pre-trained NLP model
 
52
  plt.savefig(save_output, dpi=300, bbox_inches="tight")
53
  plt.close()
54
 
55
def visualize_textrank(text):
    """Display a co-occurrence graph of the top-10 TextRank words of *text*.

    Nodes are the ten highest-scoring words; consecutive words in that
    ranked list are joined by an edge whose weight counts how often the
    pair appears adjacently in the list.
    """
    # Score the text with TextRank.
    tr = textrank.TextRank()
    tr.calculate_scores(text)

    # Top 10 words, discarding their scores.
    words = [word for word, _score in tr.top_words(10)]

    graph = nx.Graph()
    graph.add_nodes_from(words)

    # Count adjacent pairs in the ranked word list.
    counts = {}
    for first, second in zip(words, words[1:]):
        pair = (first, second)
        counts[pair] = counts.get(pair, 0) + 1

    # Add edges weighted by the co-occurrence counts.
    for (first, second), count in counts.items():
        graph.add_edge(first, second, weight=count)

    # BUG FIX: the original iterated G.edges() (2-tuples) and indexed
    # e[2]['weight'], which raises IndexError; data=True is required to
    # get the (u, v, attr_dict) form.
    edge_widths = [attrs['weight'] for _u, _v, attrs in graph.edges(data=True)]
    nx.draw(graph, with_labels=True, width=edge_widths)
    plt.show()
87
+
88
 
89
  def display_key_phrases(text, save_output):
90
  text = text.replace('\n', ' \n')
 
112
 
113
 
114
  if __name__ == '__main__':
115
+ visualize_textrank(text)
116
+ # get_top_key_phrases(text, 10, 'test_results/keyphrase.png')
117
+ # display_key_phrases(text)
keyword_extraction.py CHANGED
@@ -32,7 +32,7 @@ Captures administrative, clinical and demographic information on discharges for
32
  """
33
 
34
  def keyword_extract(doc, kw_model, n_grams, save_output='results/'):
35
- keyword_onegram = kw_model.extract_keywords(doc, keyphrase_ngram_range=(1, n_grams), stop_words=None)
36
  words = list(zip(*keyword_onegram))[0]
37
  scores = list(zip(*keyword_onegram))[1]
38
  keyword_df = pd.DataFrame({'words': words, 'scores': scores})
@@ -47,8 +47,8 @@ def keyword_extract(doc, kw_model, n_grams, save_output='results/'):
47
 
48
  if __name__ == '__main__':
49
  kw_model = KeyBERT()
50
- keyword_extract(kw_model, 1)
51
- keyword_extract(kw_model, 2)
52
- keyword_extract(kw_model, 3)
53
  keywords = kw_model.extract_keywords(test_doc, highlight=True)
54
  print(keywords)
 
32
  """
33
 
34
  def keyword_extract(doc, kw_model, n_grams, save_output='results/'):
35
+ keyword_onegram = kw_model.extract_keywords(doc, top_n=10, keyphrase_ngram_range=(1, n_grams), stop_words=None)
36
  words = list(zip(*keyword_onegram))[0]
37
  scores = list(zip(*keyword_onegram))[1]
38
  keyword_df = pd.DataFrame({'words': words, 'scores': scores})
 
47
 
48
if __name__ == '__main__':
    kw_model = KeyBERT()
    # Extract keywords for unigram, bigram, and trigram ranges.
    for n_grams in (1, 2, 3):
        keyword_extract(test_doc, kw_model, n_grams)
    keywords = kw_model.extract_keywords(test_doc, highlight=True)
    print(keywords)