Azazelle Blackroot committed
Commit ad069e0 (0 parents)

Duplicate from Blackroot/Llama-3-8B-Abomination-LORA


Co-authored-by: Coffee Vampire <[email protected]>

Files changed (4)
  1. .gitattributes +35 -0
  2. README.md +61 -0
  3. adapter_config.json +34 -0
  4. adapter_model.safetensors +3 -0
.gitattributes ADDED
@@ -0,0 +1,35 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tar filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
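These are the standard Hugging Face Hub Git LFS rules: any file matching one of these patterns (here, notably `*.safetensors`) is stored through LFS rather than the regular git object store. As a rough illustration only (git resolves attributes itself; the helper below is hypothetical), the patterns can be matched against a filename with `fnmatch`:

```python
from fnmatch import fnmatch

# A few of the patterns listed above; "*.safetensors" is the one that matters for this repo.
LFS_PATTERNS = ["*.safetensors", "*.bin", "*.pt", "*.zip"]

def is_lfs_tracked(filename: str) -> bool:
    """Hypothetical helper: would this file be routed through Git LFS?"""
    return any(fnmatch(filename, pattern) for pattern in LFS_PATTERNS)

print(is_lfs_tracked("adapter_model.safetensors"))  # True
print(is_lfs_tracked("adapter_config.json"))        # False
```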
README.md ADDED
@@ -0,0 +1,61 @@
+ Experimental model focused on RP and storytelling. This method attempts to bring some of the intrigue and style of the base model back into the instruct model.
+
+ This is a LoRA adapter trained in four stages (use with Llama-3-8B-Instruct or Llama-3-8B-Instruct abliterations).
+
+
+ Base model stage -- 1 GB of semi-structured pretraining data (uniform distribution of context lengths centered around 4096, between 512 and 8192); a rough chunking sketch follows the phase list below.
+ ![image/png](https://cdn-uploads.huggingface.co/production/uploads/637f3b03932a61b89aefbf5c/hpdbVRrM1yt65-gNtRIfT.png)
+ - Base pretraining phase 1 (constant LR, text completion -- 20,000 steps, 2/3 epoch)
+ - Base pretraining phase 2 (cosine LR, text completion -- 10,000 steps, 1/3 epoch)
+
+
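+ A minimal sketch of the kind of chunking this implies (an assumption, not the actual data pipeline; the corpus and tokenizer id are placeholders):
+ ```python
+ import random
+ from transformers import AutoTokenizer
+
+ tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct")  # placeholder tokenizer
+ raw_texts = ["...long document 1...", "...long document 2..."]                    # placeholder for ~1 GB of text
+
+ def chunk_for_pretraining(texts, lo=512, hi=8192):
+     """Cut each document into text-completion samples of uniformly random token length."""
+     samples = []
+     for text in texts:
+         ids = tokenizer(text, add_special_tokens=False)["input_ids"]
+         i = 0
+         while i < len(ids):
+             target = random.randint(lo, hi)  # uniform length between 512 and 8192 tokens
+             samples.append({"text": tokenizer.decode(ids[i:i + target])})
+             i += target
+     return samples
+
+ pretrain_dataset = chunk_for_pretraining(raw_texts)  # later fed to the trainer via the "text" field
+ ```
+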
+ Merge LORA into instruct model -- 100 MB of structured story-instruct data (all samples aim to be full-size instructions near 8192 ctx); a formatting sketch follows the phase list below.
+ ![image/png](https://cdn-uploads.huggingface.co/production/uploads/637f3b03932a61b89aefbf5c/V1Jf07k8JdI0_OzIDc7FF.png)
+ - Story-instruct tune phase 1 (constant LR, ~1250 steps, 1 epoch)
+ - Story-instruct tune phase 2 (cosine LR, ~1250 steps, 1 epoch)
+
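+ A sketch of how one story-instruct sample might be rendered into the `text` field the trainer consumes (an assumption about the data format, which is not published here); it simply applies the Llama-3-Instruct chat template:
+ ```python
+ from datasets import Dataset
+ from transformers import AutoTokenizer
+
+ tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct")  # assumed chat template
+
+ def to_story_instruct_text(instruction: str, story: str) -> str:
+     """Render one (instruction, story) pair as a single training string."""
+     messages = [
+         {"role": "user", "content": instruction},
+         {"role": "assistant", "content": story},
+     ]
+     return tokenizer.apply_chat_template(messages, tokenize=False)
+
+ train_dataset = Dataset.from_list([
+     {"text": to_story_instruct_text(
+         "Write a slow-burn mystery set in a lighthouse.",
+         "The lamp had been dark for three nights when the letters started arriving...")},
+ ])  # matches dataset_text_field="text" in the script below
+ ```
+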
+ Trained using <https://github.com/unslothai/unsloth>
+ Rough script (`model`, `tokenizer`, `train_dataset`, and `max_seq_length` are assumed to be set up beforehand, e.g. via `FastLanguageModel.from_pretrained`):
+ ```python
+ import torch
+ from transformers import IntervalStrategy, TrainingArguments
+ from trl import SFTTrainer
+ from unsloth import FastLanguageModel
+
+ # Wrap the loaded base model with a LoRA adapter.
+ model = FastLanguageModel.get_peft_model(
+     model,
+     r = 64,
+     target_modules = ["q_proj", "v_proj", "k_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
+     lora_alpha = 32,
+     lora_dropout = 0.05,  # 0 for base pretraining
+     bias = "none",
+     use_gradient_checkpointing = "unsloth",
+     random_state = 3407,
+     max_seq_length = max_seq_length,
+     use_rslora = True,
+     loftq_config = None,
+ )
+
+ trainer = SFTTrainer(
+     model = model,
+     train_dataset = train_dataset,
+     dataset_text_field = "text",
+     max_seq_length = max_seq_length,
+     tokenizer = tokenizer,
+     args = TrainingArguments(
+         per_device_train_batch_size = 2,
+         warmup_steps = 45,
+         num_train_epochs = 2,  # 1 for base pretraining
+         fp16 = not torch.cuda.is_bf16_supported(),
+         bf16 = torch.cuda.is_bf16_supported(),
+         logging_steps = 15,
+         logging_dir = "logs",
+         report_to = "tensorboard",
+         output_dir = "outputs",
+         save_strategy = IntervalStrategy.STEPS,
+         save_steps = 100,
+         save_total_limit = 30,
+         optim = "adamw_torch_fused",
+         lr_scheduler_type = "cosine",  # <- changed over time: constant for phase 1, cosine for phase 2
+         learning_rate = 5e-5,
+         weight_decay = 0.10,  # 0.15 for base pretraining
+         adam_beta1 = 0.88,  # 0.9 for base pretraining
+         adam_beta2 = 0.99,  # 0.999 for base pretraining
+     ),
+ )
+
+ trainer.train()  # kick off the current phase
+ ```
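+
+ To use the adapter at inference time, attaching it to an instruct model with PEFT should work roughly as follows (a minimal sketch; the base model id is an assumption, and the adapter id is the source repo named in this commit -- substitute whichever Llama-3-8B-Instruct variant or abliteration you prefer):
+ ```python
+ import torch
+ from peft import PeftModel
+ from transformers import AutoModelForCausalLM, AutoTokenizer
+
+ base_id = "meta-llama/Meta-Llama-3-8B-Instruct"       # assumption: any Llama-3-8B-Instruct (or abliteration)
+ adapter_id = "Blackroot/Llama-3-8B-Abomination-LORA"  # source repo this commit duplicates
+
+ tokenizer = AutoTokenizer.from_pretrained(base_id)
+ model = AutoModelForCausalLM.from_pretrained(base_id, torch_dtype=torch.bfloat16, device_map="auto")
+ model = PeftModel.from_pretrained(model, adapter_id)  # attach the LoRA weights
+
+ messages = [{"role": "user", "content": "Write the opening scene of a gothic mystery."}]
+ inputs = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt").to(model.device)
+ out = model.generate(inputs, max_new_tokens=256)
+ print(tokenizer.decode(out[0][inputs.shape[-1]:], skip_special_tokens=True))
+ ```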
adapter_config.json ADDED
@@ -0,0 +1,34 @@
+ {
+   "alpha_pattern": {},
+   "auto_mapping": null,
+   "base_model_name_or_path": "Llama-3-8b-Instruct",
+   "bias": "none",
+   "fan_in_fan_out": false,
+   "inference_mode": true,
+   "init_lora_weights": true,
+   "layer_replication": null,
+   "layers_pattern": null,
+   "layers_to_transform": null,
+   "loftq_config": {},
+   "lora_alpha": 32,
+   "lora_dropout": 0.05,
+   "megatron_config": null,
+   "megatron_core": "megatron.core",
+   "modules_to_save": null,
+   "peft_type": "LORA",
+   "r": 64,
+   "rank_pattern": {},
+   "revision": "unsloth",
+   "target_modules": [
+     "k_proj",
+     "v_proj",
+     "down_proj",
+     "q_proj",
+     "o_proj",
+     "up_proj",
+     "gate_proj"
+   ],
+   "task_type": "CAUSAL_LM",
+   "use_dora": false,
+   "use_rslora": true
+ }
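The adapter config matches the hyperparameters in the README script: r=64, lora_alpha=32, dropout 0.05, rank-stabilized LoRA, and the seven attention/MLP projection targets. For reference, roughly the same configuration can be expressed directly with `peft` (a sketch with values copied from the JSON above, not code from the repo):

```python
from peft import LoraConfig

# Mirrors adapter_config.json above.
lora_config = LoraConfig(
    r=64,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    use_rslora=True,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    task_type="CAUSAL_LM",
)
print(lora_config)
```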
adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:eba742709d1858b982e2155d8f394693f9c70960cf88a1d6223eef5a318a2bc6
+ size 671149168
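This entry is a Git LFS pointer rather than the weights themselves: the `oid` is the SHA-256 of the real ~671 MB adapter file held in LFS storage. After downloading, the file can be checked against the pointer (a small sketch; the local path is a placeholder):

```python
import hashlib
from pathlib import Path

EXPECTED_OID = "eba742709d1858b982e2155d8f394693f9c70960cf88a1d6223eef5a318a2bc6"
EXPECTED_SIZE = 671149168

path = Path("adapter_model.safetensors")  # placeholder: wherever the file was downloaded to
sha256 = hashlib.sha256()
with path.open("rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):
        sha256.update(chunk)

assert path.stat().st_size == EXPECTED_SIZE, "size does not match the LFS pointer"
assert sha256.hexdigest() == EXPECTED_OID, "sha256 does not match the LFS pointer"
print("adapter_model.safetensors matches its LFS pointer")
```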