Shredder committed on
Commit
57a4ee5
1 Parent(s): 4a6fb9b

Update fincat_utils.py

Browse files
Files changed (1) hide show
  1. fincat_utils.py +4 -38
fincat_utils.py CHANGED
@@ -29,26 +29,7 @@ def extract_context_words(x, window = 6):
29
  """The following functions have been created with inspiration from https://github.com/arushiprakash/MachineLearning/blob/main/BERT%20Word%20Embeddings.ipynb"""
30
 
31
  def bert_text_preparation(text, tokenizer):
32
- """Preparing the input for BERT
33
-
34
- Takes a string argument and performs
35
- pre-processing like adding special tokens,
36
- tokenization, tokens to ids, and tokens to
37
- segment ids. All tokens are mapped to seg-
38
- ment id = 1.
39
-
40
- Args:
41
- text (str): Text to be converted
42
- tokenizer (obj): Tokenizer object
43
- to convert text into BERT-re-
44
- adable tokens and ids
45
-
46
- Returns:
47
- list: List of BERT-readable tokens
48
- obj: Torch tensor with token ids
49
- obj: Torch tensor segment ids
50
-
51
- """
52
  marked_text = "[CLS] " + text + " [SEP]"
53
  tokenized_text = tokenizer.tokenize(marked_text)
54
  indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
@@ -61,22 +42,7 @@ def bert_text_preparation(text, tokenizer):
61
  return tokenized_text, tokens_tensor, segments_tensors
62
 
63
  def get_bert_embeddings(tokens_tensor, segments_tensors, model):
64
- """Get embeddings from an embedding model
65
-
66
- Args:
67
- tokens_tensor (obj): Torch tensor size [n_tokens]
68
- with token ids for each token in text
69
- segments_tensors (obj): Torch tensor size [n_tokens]
70
- with segment ids for each token in text
71
- model (obj): Embedding model to generate embeddings
72
- from token and segment ids
73
-
74
- Returns:
75
- list: List of list of floats of size
76
- [n_tokens, n_embedding_dimensions]
77
- containing embeddings for each token
78
- """
79
-
80
  # Gradient calculation is disabled
81
  # Model is in inference mode
82
  with torch.no_grad():
@@ -106,5 +72,5 @@ def bert_embedding_extract(context_text, word):
106
  word_embedding_all.append(word_embedding)
107
  word_embedding_mean = np.array(word_embedding_all).mean(axis=0)
108
  return word_embedding_mean
109
- except:
110
- return ['None']
 
29
  """The following functions have been created with inspiration from https://github.com/arushiprakash/MachineLearning/blob/main/BERT%20Word%20Embeddings.ipynb"""
30
 
31
  def bert_text_preparation(text, tokenizer):
32
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
  marked_text = "[CLS] " + text + " [SEP]"
34
  tokenized_text = tokenizer.tokenize(marked_text)
35
  indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
 
42
  return tokenized_text, tokens_tensor, segments_tensors
43
 
44
  def get_bert_embeddings(tokens_tensor, segments_tensors, model):
45
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
46
  # Gradient calculation is disabled
47
  # Model is in inference mode
48
  with torch.no_grad():
 
72
  word_embedding_all.append(word_embedding)
73
  word_embedding_mean = np.array(word_embedding_all).mean(axis=0)
74
  return word_embedding_mean
75
+ except:
76
+ return ['None']