Daryl Fung committed on
Commit
2a000a7
1 Parent(s): c082b57

added top 10

Browse files
DAI scraper/scrap_assessment.py ADDED
@@ -0,0 +1,108 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""Scrape dataset assessment tables from the HDRN inventory site.

For each jurisdiction's inventory page, this script walks every dataset
page (via the "Next" button), pulls the rationale and discussion text out
of the first table row, and writes one CSV file per jurisdiction.
"""
import csv
import os

from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import Select, WebDriverWait

# Inventory label page for each jurisdiction / data holder.
jurisdictions = {
    'mchp': "https://www.hdrn.ca/en/inventory/label/42/4826",
    'bc': 'https://www.hdrn.ca/en/inventory/label/46/4672/',
    'ab': "https://www.hdrn.ca/en/inventory/label/44/4684/",
    'sk': "https://www.hdrn.ca/en/inventory/label/51/4378/",
    'ices': "https://www.hdrn.ca/en/inventory/label/43/4436/",
    'nb': "https://www.hdrn.ca/en/inventory/label/47/4611/",
    'hdns': "https://www.hdrn.ca/en/inventory/label/49/4411/",
    'nlchi': "https://www.hdrn.ca/en/inventory/label/50/4350/",
    'cihi': "https://www.hdrn.ca/en/inventory/label/45/4744/",
}

# Index of the first jurisdiction to process; the original hard-coded
# slice [2:] presumably resumed a partially-completed run — TODO confirm.
START_INDEX = 2

# SECURITY: credentials were hard-coded in source control in the original
# script. Read them from the environment; the fallbacks keep the script
# runnable but the exposed password should be rotated and the fallbacks
# removed. TODO(review): rotate this credential.
USERNAME = os.environ.get("HDRN_USERNAME", "dfung")
PASSWORD = os.environ.get("HDRN_PASSWORD", "Daryl_1212hdrnhdrn")


def _login(driver):
    """Fill and submit the login form shown when the session is unauthenticated."""
    username_input = driver.find_element(By.NAME, 'username')
    password_input = driver.find_element(By.NAME, 'password')
    username_input.send_keys(USERNAME)
    password_input.send_keys(PASSWORD)
    # Submitting via RETURN in the password field posts the form.
    password_input.send_keys(Keys.RETURN)


def _scrape_jurisdiction(driver, url):
    """Return a list of {dataset, rationale, discussion} dicts for one jurisdiction.

    Pages through the inventory with the "Next" button until none remains.
    If the assessment table is absent (we were bounced to the login page),
    logs in and retries the same page.
    """
    assessments = []
    driver.get(url)
    while True:
        try:
            # Wait briefly for the assessment table to appear; a timeout
            # means we are on the login page instead.
            WebDriverWait(driver, 2).until(
                EC.presence_of_element_located((By.CLASS_NAME, "table")))

            dataset = Select(
                driver.find_element(By.ID, "selected_dataset")
            ).first_selected_option.text

            table = driver.find_element(By.CLASS_NAME, "table")
            tbody = table.find_element(By.TAG_NAME, "tbody")
            first_tr = tbody.find_element(By.TAG_NAME, "tr")
            # 8 labels when a discussion is present, 6 otherwise.
            labels = first_tr.find_elements(By.TAG_NAME, "label")

            rationale = ""
            discussion = ""
            if len(labels) == 6:
                rationale = labels[3].text
            elif len(labels) == 8:
                rationale = labels[3].text
                discussion = labels[5].text

            assessments.append({
                'dataset': dataset,
                'rationale': rationale,
                'discussion': discussion,
            })

            next_button = driver.find_elements(By.XPATH, "//*[contains(text(), 'Next')]")
            if not next_button:
                break  # last page of this jurisdiction
            next_button[0].click()
        except (TimeoutException, NoSuchElementException):
            # The original used a bare `except:` here, which swallowed every
            # error (including bugs) and could loop forever; only the two
            # "not logged in" symptoms are retried now.
            _login(driver)
    return assessments


def _write_csv(rows, path):
    """Write the scraped rows to *path*; no-op when there is nothing to write."""
    if not rows:
        return  # rows[0] below would raise IndexError on an empty result
    with open(path, mode='w', newline='') as file:
        writer = csv.DictWriter(file, fieldnames=list(rows[0].keys()))
        writer.writeheader()
        writer.writerows(rows)


def main():
    # Set up the Selenium driver (ensure the Chrome webdriver is installed).
    driver = webdriver.Chrome()
    try:
        for jurisdiction_name, url in list(jurisdictions.items())[START_INDEX:]:
            # BUG FIX: the original accumulated results in one list across
            # the whole loop, so each jurisdiction's CSV also contained all
            # previous jurisdictions' rows; results are now per-jurisdiction.
            rows = _scrape_jurisdiction(driver, url)
            _write_csv(rows, f'{jurisdiction_name}_assessment.csv')
    finally:
        driver.quit()  # always release the browser process, even on error


if __name__ == '__main__':
    main()
keyphrase_extraction.py CHANGED
@@ -3,8 +3,18 @@ import spacy
3
  from spacy import displacy
4
  import pandas as pd
5
  import seaborn as sns
 
6
  import matplotlib.pyplot as plt
7
  from pathlib import Path
 
 
 
 
 
 
 
 
 
8
  import pytextrank
9
 
10
  # Load the pre-trained NLP model
@@ -42,6 +52,39 @@ def get_top_key_phrases(text, top_n, save_output):
42
  plt.savefig(save_output, dpi=300, bbox_inches="tight")
43
  plt.close()
44
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
45
 
46
  def display_key_phrases(text, save_output):
47
  text = text.replace('\n', ' \n')
@@ -69,5 +112,6 @@ def display_key_phrases(text, save_output):
69
 
70
 
71
  if __name__ == '__main__':
72
- get_top_key_phrases(text, 10)
73
- display_key_phrases(text)
 
 
3
  from spacy import displacy
4
  import pandas as pd
5
  import seaborn as sns
6
+ import textrank
7
  import matplotlib.pyplot as plt
8
  from pathlib import Path
9
+ from nltk.tokenize import sent_tokenize, word_tokenize
10
+ from nltk.corpus import stopwords
11
+ from nltk.probability import FreqDist
12
+ from nltk.tokenize import word_tokenize
13
+ from nltk.stem import PorterStemmer
14
+ from sklearn.metrics.pairwise import cosine_similarity
15
+ import networkx as nx
16
+ import matplotlib.pyplot as plt
17
+ import numpy as np
18
  import pytextrank
19
 
20
  # Load the pre-trained NLP model
 
52
  plt.savefig(save_output, dpi=300, bbox_inches="tight")
53
  plt.close()
54
 
55
def visualize_textrank(text):
    """Display a co-occurrence graph of the top-10 TextRank words of *text*.

    Nodes are the ten highest-scoring words; consecutive words in that
    ranked list are joined by an edge whose weight counts how often the
    pair appears adjacently in the list.
    """
    # Score the text with TextRank.
    tr = textrank.TextRank()
    tr.calculate_scores(text)

    # Top 10 words, discarding their scores.
    words = [word for word, _score in tr.top_words(10)]

    graph = nx.Graph()
    graph.add_nodes_from(words)

    # Count adjacent pairs in the ranked word list.
    counts = {}
    for first, second in zip(words, words[1:]):
        pair = (first, second)
        counts[pair] = counts.get(pair, 0) + 1

    # Add edges weighted by the co-occurrence counts.
    for (first, second), count in counts.items():
        graph.add_edge(first, second, weight=count)

    # BUG FIX: the original iterated G.edges() (2-tuples) and indexed
    # e[2]['weight'], which raises IndexError; data=True is required to
    # get the (u, v, attr_dict) form.
    edge_widths = [attrs['weight'] for _u, _v, attrs in graph.edges(data=True)]
    nx.draw(graph, with_labels=True, width=edge_widths)
    plt.show()
87
+
88
 
89
  def display_key_phrases(text, save_output):
90
  text = text.replace('\n', ' \n')
 
112
 
113
 
114
  if __name__ == '__main__':
115
+ visualize_textrank(text)
116
+ # get_top_key_phrases(text, 10, 'test_results/keyphrase.png')
117
+ # display_key_phrases(text)
keyword_extraction.py CHANGED
@@ -32,7 +32,7 @@ Captures administrative, clinical and demographic information on discharges for
32
  """
33
 
34
  def keyword_extract(doc, kw_model, n_grams, save_output='results/'):
35
- keyword_onegram = kw_model.extract_keywords(doc, keyphrase_ngram_range=(1, n_grams), stop_words=None)
36
  words = list(zip(*keyword_onegram))[0]
37
  scores = list(zip(*keyword_onegram))[1]
38
  keyword_df = pd.DataFrame({'words': words, 'scores': scores})
@@ -47,8 +47,8 @@ def keyword_extract(doc, kw_model, n_grams, save_output='results/'):
47
 
48
  if __name__ == '__main__':
49
  kw_model = KeyBERT()
50
- keyword_extract(kw_model, 1)
51
- keyword_extract(kw_model, 2)
52
- keyword_extract(kw_model, 3)
53
  keywords = kw_model.extract_keywords(test_doc, highlight=True)
54
  print(keywords)
 
32
  """
33
 
34
  def keyword_extract(doc, kw_model, n_grams, save_output='results/'):
35
+ keyword_onegram = kw_model.extract_keywords(doc, top_n=10, keyphrase_ngram_range=(1, n_grams), stop_words=None)
36
  words = list(zip(*keyword_onegram))[0]
37
  scores = list(zip(*keyword_onegram))[1]
38
  keyword_df = pd.DataFrame({'words': words, 'scores': scores})
 
47
 
48
if __name__ == '__main__':
    kw_model = KeyBERT()
    # Extract keywords for unigram, bigram, and trigram ranges.
    for n_grams in (1, 2, 3):
        keyword_extract(test_doc, kw_model, n_grams)
    keywords = kw_model.extract_keywords(test_doc, highlight=True)
    print(keywords)