babylm committed on
Commit a68b420
1 Parent(s): eccedac

Upload 3 files

Files changed (3)
  1. config.json +4 -4
  2. configuration_ltgbert.py +26 -2
  3. modeling_ltgbert.py +44 -15
config.json CHANGED
@@ -1,12 +1,12 @@
 {
-  "_name_or_path": "babylm/ltgbert-100m-2024",
+  "_name_or_path": "ltg/ltg-bert-babylm",
   "architectures": [
     "LtgBertForMaskedLM"
   ],
   "attention_probs_dropout_prob": 0.1,
   "auto_map": {
-    "AutoConfig": "ltg/ltg-bert-babylm--configuration_ltgbert.LtgBertConfig",
-    "AutoModelForMaskedLM": "ltg/ltg-bert-babylm--modeling_ltgbert.LtgBertForMaskedLM",
+    "AutoConfig": "configuration_ltgbert.LtgBertConfig",
+    "AutoModelForMaskedLM": "modeling_ltgbert.LtgBertForMaskedLM",
     "AutoModelForSequenceClassification": "modeling_ltgbert.LtgBertForSequenceClassification"
   },
   "classifier_dropout": 0.2,
@@ -22,6 +22,6 @@
   "pad_token_id": 4,
   "position_bucket_size": 32,
   "torch_dtype": "float32",
-  "transformers_version": "4.43.3",
+  "transformers_version": "4.40.2",
   "vocab_size": 16384
 }
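The `auto_map` entries now point at the `configuration_ltgbert.py` and `modeling_ltgbert.py` files bundled in this repository rather than at the external `ltg/ltg-bert-babylm` repo, so the custom classes are resolved locally when the checkpoint is loaded with `trust_remote_code=True`. A minimal loading sketch; the repo id `babylm/ltgbert-100m-2024` is taken from the removed `_name_or_path` and is an assumption, so substitute whichever copy of the model you are actually using:

# Minimal sketch: loading the checkpoint after the auto_map change.
from transformers import AutoConfig, AutoModelForMaskedLM

repo_id = "babylm/ltgbert-100m-2024"  # assumed; adjust to your copy of the model

# trust_remote_code=True is required because LtgBertConfig and LtgBertForMaskedLM
# live in configuration_ltgbert.py / modeling_ltgbert.py inside the repo,
# not in the transformers library itself.
config = AutoConfig.from_pretrained(repo_id, trust_remote_code=True)
model = AutoModelForMaskedLM.from_pretrained(repo_id, trust_remote_code=True)

print(type(config).__name__)  # LtgBertConfig
print(type(model).__name__)   # LtgBertForMaskedLM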
configuration_ltgbert.py CHANGED
@@ -19,6 +19,30 @@
 from transformers.configuration_utils import PretrainedConfig
 
 
+LTG_BERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
+    "bnc-bert-span": "https://huggingface.co/ltg/bnc-bert-span",
+    "bnc-bert-span-2x": "https://huggingface.co/ltg/bnc-bert-span-2x",
+    "bnc-bert-span-0.5x": "https://huggingface.co/ltg/bnc-bert-span-0.5x",
+    "bnc-bert-span-0.25x": "https://huggingface.co/ltg/bnc-bert-span-0.25x",
+    "bnc-bert-span-order": "https://huggingface.co/ltg/bnc-bert-span-order",
+    "bnc-bert-span-document": "https://huggingface.co/ltg/bnc-bert-span-document",
+    "bnc-bert-span-word": "https://huggingface.co/ltg/bnc-bert-span-word",
+    "bnc-bert-span-subword": "https://huggingface.co/ltg/bnc-bert-span-subword",
+
+    "norbert3-xs": "https://huggingface.co/ltg/norbert3-xs/config.json",
+    "norbert3-small": "https://huggingface.co/ltg/norbert3-small/config.json",
+    "norbert3-base": "https://huggingface.co/ltg/norbert3-base/config.json",
+    "norbert3-large": "https://huggingface.co/ltg/norbert3-large/config.json",
+
+    "norbert3-oversampled-base": "https://huggingface.co/ltg/norbert3-oversampled-base/config.json",
+    "norbert3-ncc-base": "https://huggingface.co/ltg/norbert3-ncc-base/config.json",
+    "norbert3-nak-base": "https://huggingface.co/ltg/norbert3-nak-base/config.json",
+    "norbert3-nb-base": "https://huggingface.co/ltg/norbert3-nb-base/config.json",
+    "norbert3-wiki-base": "https://huggingface.co/ltg/norbert3-wiki-base/config.json",
+    "norbert3-c4-base": "https://huggingface.co/ltg/norbert3-c4-base/config.json"
+}
+
+
 class LtgBertConfig(PretrainedConfig):
     r"""
     This is the configuration class to store the configuration of a [`LtgBertModel`]. It is used to
@@ -49,7 +73,7 @@ class LtgBertConfig(PretrainedConfig):
         classifier_dropout (`float`, *optional*):
             The dropout ratio for the classification head.
     """
-    model_type = "bert"
+    model_type = "ltgbert"
     def __init__(
         self,
         vocab_size=16384,
@@ -80,4 +104,4 @@ class LtgBertConfig(PretrainedConfig):
         self.output_all_encoded_layers = output_all_encoded_layers
         self.position_bucket_size = position_bucket_size
         self.layer_norm_eps = layer_norm_eps
-        self.classifier_dropout = classifier_dropout
+        self.classifier_dropout = classifier_dropout
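Besides adding the pretrained-config archive map, the substantive change here is `model_type` moving from "bert" to "ltgbert", which keeps this config from colliding with the built-in BERT configuration when the class is resolved through the Auto machinery. A small sketch of constructing the config directly, assuming `configuration_ltgbert.py` from this repo is importable (e.g. from a local clone); the keyword values mirror `config.json` above:

# Minimal sketch: instantiating LtgBertConfig from the local module.
from configuration_ltgbert import LtgBertConfig

config = LtgBertConfig(
    vocab_size=16384,         # matches config.json
    position_bucket_size=32,
    classifier_dropout=0.2,
)

print(config.model_type)          # "ltgbert" after this commit (previously "bert")
print(config.classifier_dropout)  # 0.2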
modeling_ltgbert.py CHANGED
@@ -39,10 +39,34 @@ from transformers.pytorch_utils import softmax_backward_data
 from transformers.utils import add_start_docstrings, add_start_docstrings_to_model_forward
 
 
-_CHECKPOINT_FOR_DOC = "ltg/ltg-bert-bnc"
+_CHECKPOINT_FOR_DOC = "ltg/bnc-bert-span"
 _CONFIG_FOR_DOC = "LtgBertConfig"
 
 
+LTG_BERT_PRETRAINED_MODEL_ARCHIVE_LIST = [
+    "bnc-bert-span",
+    "bnc-bert-span-2x",
+    "bnc-bert-span-0.5x",
+    "bnc-bert-span-0.25x",
+    "bnc-bert-span-order",
+    "bnc-bert-span-document",
+    "bnc-bert-span-word",
+    "bnc-bert-span-subword",
+
+    "norbert3-xs",
+    "norbert3-small",
+    "norbert3-base",
+    "norbert3-large",
+
+    "norbert3-oversampled-base",
+    "norbert3-ncc-base",
+    "norbert3-nak-base",
+    "norbert3-nb-base",
+    "norbert3-wiki-base",
+    "norbert3-c4-base"
+]
+
+
 class Encoder(nn.Module):
     def __init__(self, config, activation_checkpointing=False):
         super().__init__()
@@ -224,8 +248,10 @@ class Attention(nn.Module):
 
         attention_scores = torch.bmm(query, key.transpose(1, 2) * self.scale)
 
-        pos = self.in_proj_qk(self.dropout(relative_embedding))  # shape: [2T-1, 2D]
-        query_pos, key_pos = pos.view(-1, self.num_heads, 2*self.head_size).chunk(2, dim=2)
+        query_pos, key_pos = self.in_proj_qk(self.dropout(relative_embedding)).chunk(2, dim=-1)  # shape: [2T-1, D]
+        query_pos = query_pos.view(-1, self.num_heads, self.head_size)  # shape: [2T-1, H, D]
+        key_pos = key_pos.view(-1, self.num_heads, self.head_size)  # shape: [2T-1, H, D]
+
         query = query.view(batch_size, self.num_heads, query_len, self.head_size)
         key = key.view(batch_size, self.num_heads, query_len, self.head_size)
 
@@ -367,8 +393,6 @@ class LtgBertModel(LtgBertPreTrainedModel):
     ) -> List[torch.Tensor]:
         if input_ids is not None:
             input_shape = input_ids.size()
-        # elif inputs_embeds is not None:
-        #     input_shape = inputs_embeds.size()[:-1]
         else:
             raise ValueError("You have to specify input_ids")
 
@@ -380,9 +404,7 @@ class LtgBertModel(LtgBertPreTrainedModel):
         else:
             attention_mask = ~attention_mask.bool()
             attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)
-
-        # if inputs_embeds is None:
-        #     static_embeddings, relative_embedding = self.embedding(input_ids.t())
+
         static_embeddings, relative_embedding = self.embedding(input_ids.t())
         contextualized_embeddings, attention_probs = self.transformer(static_embeddings, attention_mask, relative_embedding)
         contextualized_embeddings = [e.transpose(0, 1) for e in contextualized_embeddings]
@@ -409,7 +431,8 @@ class LtgBertModel(LtgBertPreTrainedModel):
         )
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
 
-        sequence_output, contextualized_embeddings, attention_probs = self.get_contextualized_embeddings(input_ids, attention_mask)
+        sequence_output, contextualized_embeddings, attention_probs = self.get_contextualized_embeddings(input_ids=input_ids,
+                                                                                                          attention_mask=attention_mask)
 
         if not return_dict:
             return (
@@ -456,7 +479,8 @@ class LtgBertForMaskedLM(LtgBertModel):
         """
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
 
-        sequence_output, contextualized_embeddings, attention_probs = self.get_contextualized_embeddings(input_ids, attention_mask)
+        sequence_output, contextualized_embeddings, attention_probs = self.get_contextualized_embeddings(input_ids=input_ids,
+                                                                                                          attention_mask=attention_mask)
         subword_prediction = self.classifier(sequence_output)
 
         masked_lm_loss = None
@@ -554,8 +578,9 @@ class LtgBertForSequenceClassification(LtgBertModel):
         """
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
 
-        sequence_output, contextualized_embeddings, attention_probs = self.get_contextualized_embeddings(input_ids, inputs_embeds,
-                                                                                                          ~attention_mask)
+        sequence_output, contextualized_embeddings, attention_probs = self.get_contextualized_embeddings(input_ids=input_ids,
+                                                                                                          inputs_embeds=inputs_embeds,
+                                                                                                          attention_mask=attention_mask)
         logits = self.head(sequence_output[:, 0, :])
 
         loss = None
@@ -628,7 +653,8 @@ class LtgBertForTokenClassification(LtgBertModel):
     ) -> Union[Tuple[torch.Tensor], TokenClassifierOutput]:
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
 
-        sequence_output, contextualized_embeddings, attention_probs = self.get_contextualized_embeddings(input_ids, attention_mask)
+        sequence_output, contextualized_embeddings, attention_probs = self.get_contextualized_embeddings(input_ids=input_ids,
+                                                                                                          attention_mask=attention_mask)
         logits = self.head(sequence_output)
 
         loss = None
@@ -684,7 +710,8 @@ class LtgBertForQuestionAnswering(LtgBertModel):
     ) -> Union[Tuple[torch.Tensor], QuestionAnsweringModelOutput]:
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
 
-        sequence_output, contextualized_embeddings, attention_probs = self.get_contextualized_embeddings(input_ids, attention_mask)
+        sequence_output, contextualized_embeddings, attention_probs = self.get_contextualized_embeddings(input_ids=input_ids,
+                                                                                                          attention_mask=attention_mask)
         logits = self.head(sequence_output)
 
         start_logits, end_logits = logits.split(1, dim=-1)
@@ -762,7 +789,8 @@ class LtgBertForMultipleChoice(LtgBertModel):
         flat_input_ids = input_ids.view(-1, input_ids.size(-1))
         flat_attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None
 
-        sequence_output, contextualized_embeddings, attention_probs = self.get_contextualized_embeddings(flat_input_ids, flat_attention_mask)
+        sequence_output, contextualized_embeddings, attention_probs = self.get_contextualized_embeddings(input_ids=flat_input_ids,
+                                                                                                          attention_mask=flat_attention_mask)
         logits = self.head(sequence_output)
         reshaped_logits = logits.view(-1, num_choices)
 
@@ -785,3 +813,4 @@ class LtgBertForMultipleChoice(LtgBertModel):
             hidden_states=contextualized_embeddings if output_hidden_states else None,
             attentions=attention_probs if output_attentions else None
         )
+
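Two things change in the modeling code: every `forward` now calls `get_contextualized_embeddings` with explicit keyword arguments (replacing the sequence-classification call that previously passed `inputs_embeds` and `~attention_mask` positionally), and the relative-embedding projection is split into query/key halves before the per-head reshape rather than after. The snippet below isolates that reshape change; it is a standalone sketch with made-up shapes and a plain `nn.Linear` standing in for `self.in_proj_qk`, not the model's actual attention code:

# Standalone sketch of the relative-embedding split, outside the model.
# T = sequence length, H = number of heads, D = H * head_size (hidden size).
import torch

T, H, head_size = 8, 4, 16
D = H * head_size

relative_embedding = torch.randn(2 * T - 1, D)
in_proj_qk = torch.nn.Linear(D, 2 * D)        # stand-in for self.in_proj_qk

pos = in_proj_qk(relative_embedding)          # [2T-1, 2D]

# New ordering: split the 2D projection into query/key halves, then view per head.
query_pos, key_pos = pos.chunk(2, dim=-1)     # each [2T-1, D]
query_pos = query_pos.view(-1, H, head_size)  # [2T-1, H, head_size]
key_pos = key_pos.view(-1, H, head_size)      # [2T-1, H, head_size]

# Old ordering (removed): view into per-head blocks of size 2*head_size first,
# then chunk inside each block. The two layouts read different slices of `pos`
# for every head except the first, so they are not interchangeable.
old_q, old_k = pos.view(-1, H, 2 * head_size).chunk(2, dim=2)
print(torch.equal(query_pos, old_q))  # False in general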