pritamdeka committed on
Commit
24aa96d
1 Parent(s): 8977631

Upload folder using huggingface_hub

Browse files
README.md ADDED
@@ -0,0 +1,74 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: apache-2.0
3
+ base_model: google/muril-base-cased
4
+ tags:
5
+ - generated_from_trainer
6
+ metrics:
7
+ - accuracy
8
+ model-index:
9
+ - name: temp_assamese
10
+ results: []
11
+ ---
12
+
13
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
14
+ should probably proofread and complete it, then remove this comment. -->
15
+
16
+ # temp_assamese
17
+
18
+ This model is a fine-tuned version of [google/muril-base-cased](https://huggingface.co/google/muril-base-cased) on an unknown dataset.
19
+ It achieves the following results on the evaluation set:
20
+ - Loss: 1.4149
21
+ - Accuracy: 0.7014
22
+
23
+ ## Model description
24
+
25
+ More information needed
26
+
27
+ ## Intended uses & limitations
28
+
29
+ More information needed
30
+
31
+ ## Training and evaluation data
32
+
33
+ More information needed
34
+
35
+ ## Training procedure
36
+
37
+ ### Training hyperparameters
38
+
39
+ The following hyperparameters were used during training:
40
+ - learning_rate: 5e-05
41
+ - train_batch_size: 16
42
+ - eval_batch_size: 16
43
+ - seed: 42
44
+ - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
45
+ - lr_scheduler_type: linear
46
+ - num_epochs: 2.0
47
+ - mixed_precision_training: Native AMP
48
+
49
+ ### Training results
50
+
51
+ | Training Loss | Epoch | Step | Validation Loss | Accuracy |
52
+ |:-------------:|:------:|:-----:|:---------------:|:--------:|
53
+ | 2.2163 | 0.1409 | 2000 | 1.8646 | 0.6320 |
54
+ | 1.9456 | 0.2818 | 4000 | 1.7492 | 0.6495 |
55
+ | 1.8391 | 0.4227 | 6000 | 1.6770 | 0.6606 |
56
+ | 1.7704 | 0.5637 | 8000 | 1.6166 | 0.6707 |
57
+ | 1.7213 | 0.7046 | 10000 | 1.5818 | 0.6759 |
58
+ | 1.6802 | 0.8455 | 12000 | 1.5403 | 0.6820 |
59
+ | 1.6432 | 0.9864 | 14000 | 1.5153 | 0.6858 |
60
+ | 1.6074 | 1.1273 | 16000 | 1.4965 | 0.6885 |
61
+ | 1.5833 | 1.2682 | 18000 | 1.4678 | 0.6934 |
62
+ | 1.5649 | 1.4091 | 20000 | 1.4508 | 0.6950 |
63
+ | 1.5530 | 1.5501 | 22000 | 1.4367 | 0.6985 |
64
+ | 1.5345 | 1.6910 | 24000 | 1.4231 | 0.7001 |
65
+ | 1.5261 | 1.8319 | 26000 | 1.4157 | 0.7013 |
66
+ | 1.5148 | 1.9728 | 28000 | 1.4098 | 0.7027 |
67
+
68
+
69
+ ### Framework versions
70
+
71
+ - Transformers 4.43.0.dev0
72
+ - PyTorch 2.3.0+cu121
73
+ - Datasets 2.20.0
74
+ - Tokenizers 0.19.1
all_results.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 2.0,
3
+ "eval_accuracy": 0.7014180145490598,
4
+ "eval_loss": 1.4148573875427246,
5
+ "eval_runtime": 102.6889,
6
+ "eval_samples": 11967,
7
+ "eval_samples_per_second": 116.536,
8
+ "eval_steps_per_second": 7.284,
9
+ "perplexity": 4.115899445582212,
10
+ "total_flos": 1.197729267088466e+17,
11
+ "train_loss": 1.6903211268009264,
12
+ "train_runtime": 8975.6005,
13
+ "train_samples": 227086,
14
+ "train_samples_per_second": 50.601,
15
+ "train_steps_per_second": 3.163
16
+ }
config.json ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "google/muril-base-cased",
3
+ "architectures": [
4
+ "BertForMaskedLM"
5
+ ],
6
+ "attention_probs_dropout_prob": 0.1,
7
+ "classifier_dropout": null,
8
+ "embedding_size": 768,
9
+ "hidden_act": "gelu",
10
+ "hidden_dropout_prob": 0.1,
11
+ "hidden_size": 768,
12
+ "initializer_range": 0.02,
13
+ "intermediate_size": 3072,
14
+ "layer_norm_eps": 1e-12,
15
+ "max_position_embeddings": 512,
16
+ "model_type": "bert",
17
+ "num_attention_heads": 12,
18
+ "num_hidden_layers": 12,
19
+ "pad_token_id": 0,
20
+ "position_embedding_type": "absolute",
21
+ "torch_dtype": "float32",
22
+ "transformers_version": "4.43.0.dev0",
23
+ "type_vocab_size": 2,
24
+ "use_cache": true,
25
+ "vocab_size": 197285
26
+ }
eval_results.json ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 2.0,
3
+ "eval_accuracy": 0.7014180145490598,
4
+ "eval_loss": 1.4148573875427246,
5
+ "eval_runtime": 102.6889,
6
+ "eval_samples": 11967,
7
+ "eval_samples_per_second": 116.536,
8
+ "eval_steps_per_second": 7.284,
9
+ "perplexity": 4.115899445582212
10
+ }
generation_config.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "pad_token_id": 0,
4
+ "transformers_version": "4.43.0.dev0"
5
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c367e91281dc25023d6a3185659ce4a7ff0b953b29b3096d4a655e704d5be21c
3
+ size 951043900
runs/Jul09_18-29-14_c3109aa98e74/events.out.tfevents.1720550282.c3109aa98e74.7081.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:40fc0dfa0241280baccbcc3fbdb0e7b254e1bc78a8730e3578f0ae444c83beb6
3
+ size 12820
runs/Jul09_18-29-14_c3109aa98e74/events.out.tfevents.1720559363.c3109aa98e74.7081.1 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7d4b45a85641a56f4771ade9ddf44479274f65c41d88bdf57acbd3dc8b46e2b1
3
+ size 417
special_tokens_map.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "cls_token": "[CLS]",
3
+ "mask_token": "[MASK]",
4
+ "pad_token": "[PAD]",
5
+ "sep_token": "[SEP]",
6
+ "unk_token": "[UNK]"
7
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "[PAD]",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "100": {
12
+ "content": "[UNK]",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "103": {
20
+ "content": "[MASK]",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "104": {
28
+ "content": "[CLS]",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "105": {
36
+ "content": "[SEP]",
37
+ "lstrip": false,
38
+ "normalized": false,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": true
42
+ }
43
+ },
44
+ "clean_up_tokenization_spaces": true,
45
+ "cls_token": "[CLS]",
46
+ "do_basic_tokenize": true,
47
+ "do_lower_case": false,
48
+ "lowercase": false,
49
+ "mask_token": "[MASK]",
50
+ "model_max_length": 512,
51
+ "never_split": null,
52
+ "pad_token": "[PAD]",
53
+ "sep_token": "[SEP]",
54
+ "strip_accents": false,
55
+ "tokenize_chinese_chars": true,
56
+ "tokenizer_class": "BertTokenizer",
57
+ "unk_token": "[UNK]"
58
+ }
train_results.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 2.0,
3
+ "total_flos": 1.197729267088466e+17,
4
+ "train_loss": 1.6903211268009264,
5
+ "train_runtime": 8975.6005,
6
+ "train_samples": 227086,
7
+ "train_samples_per_second": 50.601,
8
+ "train_steps_per_second": 3.163
9
+ }
trainer_state.json ADDED
@@ -0,0 +1,266 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 0.7026871898897578,
3
+ "best_model_checkpoint": "/content/temp_assamese/checkpoint-28000",
4
+ "epoch": 2.0,
5
+ "eval_steps": 2000,
6
+ "global_step": 28386,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.14091453533431975,
13
+ "grad_norm": 12.538192749023438,
14
+ "learning_rate": 4.648418234340873e-05,
15
+ "loss": 2.2163,
16
+ "step": 2000
17
+ },
18
+ {
19
+ "epoch": 0.14091453533431975,
20
+ "eval_accuracy": 0.6319857833787309,
21
+ "eval_loss": 1.8646236658096313,
22
+ "eval_runtime": 102.3117,
23
+ "eval_samples_per_second": 116.966,
24
+ "eval_steps_per_second": 7.311,
25
+ "step": 2000
26
+ },
27
+ {
28
+ "epoch": 0.2818290706686395,
29
+ "grad_norm": 12.645801544189453,
30
+ "learning_rate": 4.296131896005073e-05,
31
+ "loss": 1.9456,
32
+ "step": 4000
33
+ },
34
+ {
35
+ "epoch": 0.2818290706686395,
36
+ "eval_accuracy": 0.6494802758779904,
37
+ "eval_loss": 1.749164342880249,
38
+ "eval_runtime": 102.5618,
39
+ "eval_samples_per_second": 116.681,
40
+ "eval_steps_per_second": 7.293,
41
+ "step": 4000
42
+ },
43
+ {
44
+ "epoch": 0.4227436060029592,
45
+ "grad_norm": 9.79688835144043,
46
+ "learning_rate": 3.943845557669274e-05,
47
+ "loss": 1.8391,
48
+ "step": 6000
49
+ },
50
+ {
51
+ "epoch": 0.4227436060029592,
52
+ "eval_accuracy": 0.6606430993204072,
53
+ "eval_loss": 1.6770141124725342,
54
+ "eval_runtime": 102.5061,
55
+ "eval_samples_per_second": 116.744,
56
+ "eval_steps_per_second": 7.297,
57
+ "step": 6000
58
+ },
59
+ {
60
+ "epoch": 0.563658141337279,
61
+ "grad_norm": 10.446520805358887,
62
+ "learning_rate": 3.5917353625026426e-05,
63
+ "loss": 1.7704,
64
+ "step": 8000
65
+ },
66
+ {
67
+ "epoch": 0.563658141337279,
68
+ "eval_accuracy": 0.6706733344622967,
69
+ "eval_loss": 1.6165672540664673,
70
+ "eval_runtime": 102.6569,
71
+ "eval_samples_per_second": 116.573,
72
+ "eval_steps_per_second": 7.286,
73
+ "step": 8000
74
+ },
75
+ {
76
+ "epoch": 0.7045726766715987,
77
+ "grad_norm": 9.16817569732666,
78
+ "learning_rate": 3.23962516733601e-05,
79
+ "loss": 1.7213,
80
+ "step": 10000
81
+ },
82
+ {
83
+ "epoch": 0.7045726766715987,
84
+ "eval_accuracy": 0.6759152415500662,
85
+ "eval_loss": 1.5817841291427612,
86
+ "eval_runtime": 102.3347,
87
+ "eval_samples_per_second": 116.94,
88
+ "eval_steps_per_second": 7.309,
89
+ "step": 10000
90
+ },
91
+ {
92
+ "epoch": 0.8454872120059184,
93
+ "grad_norm": 9.296396255493164,
94
+ "learning_rate": 2.8875149721693794e-05,
95
+ "loss": 1.6802,
96
+ "step": 12000
97
+ },
98
+ {
99
+ "epoch": 0.8454872120059184,
100
+ "eval_accuracy": 0.6820033844378066,
101
+ "eval_loss": 1.5402722358703613,
102
+ "eval_runtime": 102.6712,
103
+ "eval_samples_per_second": 116.557,
104
+ "eval_steps_per_second": 7.285,
105
+ "step": 12000
106
+ },
107
+ {
108
+ "epoch": 0.9864017473402381,
109
+ "grad_norm": 7.622576713562012,
110
+ "learning_rate": 2.5354047770027478e-05,
111
+ "loss": 1.6432,
112
+ "step": 14000
113
+ },
114
+ {
115
+ "epoch": 0.9864017473402381,
116
+ "eval_accuracy": 0.6857634747617221,
117
+ "eval_loss": 1.5153496265411377,
118
+ "eval_runtime": 102.6162,
119
+ "eval_samples_per_second": 116.619,
120
+ "eval_steps_per_second": 7.289,
121
+ "step": 14000
122
+ },
123
+ {
124
+ "epoch": 1.127316282674558,
125
+ "grad_norm": 11.516377449035645,
126
+ "learning_rate": 2.1832945818361165e-05,
127
+ "loss": 1.6074,
128
+ "step": 16000
129
+ },
130
+ {
131
+ "epoch": 1.127316282674558,
132
+ "eval_accuracy": 0.6885159496543541,
133
+ "eval_loss": 1.496454119682312,
134
+ "eval_runtime": 102.8787,
135
+ "eval_samples_per_second": 116.321,
136
+ "eval_steps_per_second": 7.271,
137
+ "step": 16000
138
+ },
139
+ {
140
+ "epoch": 1.2682308180088775,
141
+ "grad_norm": 9.480605125427246,
142
+ "learning_rate": 1.831008243500317e-05,
143
+ "loss": 1.5833,
144
+ "step": 18000
145
+ },
146
+ {
147
+ "epoch": 1.2682308180088775,
148
+ "eval_accuracy": 0.6934179945828158,
149
+ "eval_loss": 1.4677945375442505,
150
+ "eval_runtime": 102.7153,
151
+ "eval_samples_per_second": 116.506,
152
+ "eval_steps_per_second": 7.282,
153
+ "step": 18000
154
+ },
155
+ {
156
+ "epoch": 1.4091453533431975,
157
+ "grad_norm": 8.643658638000488,
158
+ "learning_rate": 1.4788980483336856e-05,
159
+ "loss": 1.5649,
160
+ "step": 20000
161
+ },
162
+ {
163
+ "epoch": 1.4091453533431975,
164
+ "eval_accuracy": 0.6950099353567151,
165
+ "eval_loss": 1.4508079290390015,
166
+ "eval_runtime": 102.6391,
167
+ "eval_samples_per_second": 116.593,
168
+ "eval_steps_per_second": 7.288,
169
+ "step": 20000
170
+ },
171
+ {
172
+ "epoch": 1.550059888677517,
173
+ "grad_norm": 7.6539506912231445,
174
+ "learning_rate": 1.1267878531670542e-05,
175
+ "loss": 1.553,
176
+ "step": 22000
177
+ },
178
+ {
179
+ "epoch": 1.550059888677517,
180
+ "eval_accuracy": 0.698540482055296,
181
+ "eval_loss": 1.436693787574768,
182
+ "eval_runtime": 102.5486,
183
+ "eval_samples_per_second": 116.696,
184
+ "eval_steps_per_second": 7.294,
185
+ "step": 22000
186
+ },
187
+ {
188
+ "epoch": 1.690974424011837,
189
+ "grad_norm": 8.063584327697754,
190
+ "learning_rate": 7.746776580004228e-06,
191
+ "loss": 1.5345,
192
+ "step": 24000
193
+ },
194
+ {
195
+ "epoch": 1.690974424011837,
196
+ "eval_accuracy": 0.7001222876777317,
197
+ "eval_loss": 1.4230775833129883,
198
+ "eval_runtime": 102.8829,
199
+ "eval_samples_per_second": 116.317,
200
+ "eval_steps_per_second": 7.27,
201
+ "step": 24000
202
+ },
203
+ {
204
+ "epoch": 1.8318889593461565,
205
+ "grad_norm": 8.720465660095215,
206
+ "learning_rate": 4.2256746283379135e-06,
207
+ "loss": 1.5261,
208
+ "step": 26000
209
+ },
210
+ {
211
+ "epoch": 1.8318889593461565,
212
+ "eval_accuracy": 0.701337358949075,
213
+ "eval_loss": 1.4157360792160034,
214
+ "eval_runtime": 102.7046,
215
+ "eval_samples_per_second": 116.519,
216
+ "eval_steps_per_second": 7.283,
217
+ "step": 26000
218
+ },
219
+ {
220
+ "epoch": 1.9728034946804764,
221
+ "grad_norm": 9.144937515258789,
222
+ "learning_rate": 7.045726766715987e-07,
223
+ "loss": 1.5148,
224
+ "step": 28000
225
+ },
226
+ {
227
+ "epoch": 1.9728034946804764,
228
+ "eval_accuracy": 0.7026871898897578,
229
+ "eval_loss": 1.4097787141799927,
230
+ "eval_runtime": 102.5398,
231
+ "eval_samples_per_second": 116.706,
232
+ "eval_steps_per_second": 7.295,
233
+ "step": 28000
234
+ },
235
+ {
236
+ "epoch": 2.0,
237
+ "step": 28386,
238
+ "total_flos": 1.197729267088466e+17,
239
+ "train_loss": 1.6903211268009264,
240
+ "train_runtime": 8975.6005,
241
+ "train_samples_per_second": 50.601,
242
+ "train_steps_per_second": 3.163
243
+ }
244
+ ],
245
+ "logging_steps": 2000,
246
+ "max_steps": 28386,
247
+ "num_input_tokens_seen": 0,
248
+ "num_train_epochs": 2,
249
+ "save_steps": 2000,
250
+ "stateful_callbacks": {
251
+ "TrainerControl": {
252
+ "args": {
253
+ "should_epoch_stop": false,
254
+ "should_evaluate": false,
255
+ "should_log": false,
256
+ "should_save": true,
257
+ "should_training_stop": true
258
+ },
259
+ "attributes": {}
260
+ }
261
+ },
262
+ "total_flos": 1.197729267088466e+17,
263
+ "train_batch_size": 16,
264
+ "trial_name": null,
265
+ "trial_params": null
266
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c7f14d1d4ab23358fb655d6b451b96dfc8de5e4a244f655a8173f25485d49272
3
+ size 5176
vocab.txt ADDED
The diff for this file is too large to render. See raw diff