sanchit-gandhi HF staff committed on
Commit
4b450eb
1 Parent(s): b5fbc1e

Upload tokenizer

Browse files
Files changed (3) hide show
  1. special_tokens_map.json +4 -0
  2. tokenizer_config.json +12 -0
  3. vocab.json +121 -0
special_tokens_map.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "pad_token": "ἤ",
3
+ "unk_token": "<unk>"
4
+ }
tokenizer_config.json ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_blank": true,
3
+ "clean_up_tokenization_spaces": true,
4
+ "is_uroman": false,
5
+ "language": "grc",
6
+ "model_max_length": 1000000000000000019884624838656,
7
+ "normalize": true,
8
+ "pad_token": "ἤ",
9
+ "phonemize": false,
10
+ "tokenizer_class": "VitsTokenizer",
11
+ "unk_token": "<unk>"
12
+ }
vocab.json ADDED
@@ -0,0 +1,121 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ " ": 31,
3
+ "'": 81,
4
+ "-": 21,
5
+ "_": 118,
6
+ "ΐ": 37,
7
+ "ά": 12,
8
+ "έ": 15,
9
+ "ή": 70,
10
+ "ί": 90,
11
+ "ΰ": 27,
12
+ "α": 102,
13
+ "β": 46,
14
+ "γ": 20,
15
+ "δ": 87,
16
+ "ε": 96,
17
+ "ζ": 73,
18
+ "η": 19,
19
+ "θ": 9,
20
+ "ι": 91,
21
+ "κ": 24,
22
+ "λ": 39,
23
+ "μ": 26,
24
+ "ν": 99,
25
+ "ξ": 22,
26
+ "ο": 77,
27
+ "π": 14,
28
+ "ρ": 2,
29
+ "ς": 23,
30
+ "σ": 56,
31
+ "τ": 44,
32
+ "υ": 40,
33
+ "φ": 69,
34
+ "χ": 55,
35
+ "ψ": 5,
36
+ "ω": 106,
37
+ "ϊ": 103,
38
+ "ϋ": 94,
39
+ "ό": 83,
40
+ "ύ": 72,
41
+ "ώ": 114,
42
+ "ἀ": 54,
43
+ "ἁ": 41,
44
+ "ἂ": 76,
45
+ "ἃ": 88,
46
+ "ἄ": 7,
47
+ "ἅ": 57,
48
+ "ἆ": 48,
49
+ "ἐ": 53,
50
+ "ἑ": 60,
51
+ "ἓ": 47,
52
+ "ἔ": 32,
53
+ "ἕ": 85,
54
+ "ἠ": 86,
55
+ "ἡ": 92,
56
+ "ἢ": 16,
57
+ "ἣ": 78,
58
+ "ἤ": 0,
59
+ "ἥ": 111,
60
+ "ἦ": 59,
61
+ "ἧ": 45,
62
+ "ἰ": 79,
63
+ "ἱ": 52,
64
+ "ἳ": 84,
65
+ "ἴ": 109,
66
+ "ἵ": 107,
67
+ "ἶ": 6,
68
+ "ἷ": 61,
69
+ "ὀ": 95,
70
+ "ὁ": 67,
71
+ "ὃ": 97,
72
+ "ὄ": 35,
73
+ "ὅ": 43,
74
+ "ὐ": 93,
75
+ "ὑ": 10,
76
+ "ὒ": 25,
77
+ "ὓ": 28,
78
+ "ὔ": 42,
79
+ "ὕ": 110,
80
+ "ὖ": 80,
81
+ "ὗ": 3,
82
+ "ὠ": 29,
83
+ "ὡ": 1,
84
+ "ὢ": 8,
85
+ "ὤ": 75,
86
+ "ὥ": 100,
87
+ "ὦ": 113,
88
+ "ὧ": 74,
89
+ "ὰ": 71,
90
+ "ὲ": 112,
91
+ "ὴ": 105,
92
+ "ὶ": 34,
93
+ "ὸ": 33,
94
+ "ὺ": 117,
95
+ "ὼ": 30,
96
+ "ᾄ": 58,
97
+ "ᾅ": 18,
98
+ "ᾐ": 108,
99
+ "ᾑ": 50,
100
+ "ᾔ": 104,
101
+ "ᾖ": 98,
102
+ "ᾗ": 49,
103
+ "ᾠ": 17,
104
+ "ᾧ": 4,
105
+ "ᾳ": 38,
106
+ "ᾶ": 36,
107
+ "ᾷ": 115,
108
+ "ῃ": 116,
109
+ "ῄ": 63,
110
+ "ῆ": 51,
111
+ "ῇ": 13,
112
+ "ῒ": 62,
113
+ "ῖ": 65,
114
+ "ῥ": 68,
115
+ "ῦ": 101,
116
+ "ῳ": 64,
117
+ "ῴ": 89,
118
+ "ῶ": 82,
119
+ "ῷ": 66,
120
+ "–": 11
121
+ }