marisming commited on
Commit
0b88e34
·
verified ·
1 Parent(s): 20d85f4

Upload folder using huggingface_hub

Browse files
.gitattributes CHANGED
@@ -33,3 +33,15 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ train_data/de_wiki_4g.txt filter=lfs diff=lfs merge=lfs -text
37
+ train_data/dna_4g.txt filter=lfs diff=lfs merge=lfs -text
38
+ train_data/en_wiki_4g.txt filter=lfs diff=lfs merge=lfs -text
39
+ train_data/es_wiki_4g.txt filter=lfs diff=lfs merge=lfs -text
40
+ train_data/fr_wiki_4g.txt filter=lfs diff=lfs merge=lfs -text
41
+ train_data/gene_eng.txt filter=lfs diff=lfs merge=lfs -text
42
+ train_data/gene_eng_zh.txt filter=lfs diff=lfs merge=lfs -text
43
+ train_data/gene_eng_zh_de_es.txt filter=lfs diff=lfs merge=lfs -text
44
+ train_data/ja_wiki_4g.txt filter=lfs diff=lfs merge=lfs -text
45
+ train_data/ko_wiki_4g.txt filter=lfs diff=lfs merge=lfs -text
46
+ train_data/protein_4g.txt filter=lfs diff=lfs merge=lfs -text
47
+ train_data/zh_wiki_4g.txt filter=lfs diff=lfs merge=lfs -text
get_data/.ipynb_checkpoints/get_wiki1-checkpoint.ipynb ADDED
@@ -0,0 +1,1422 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "id": "163a0ec0-2003-40f8-bf8a-63c1146f9130",
7
+ "metadata": {},
8
+ "outputs": [
9
+ {
10
+ "data": {
11
+ "text/plain": [
12
+ "\"\\nimport os\\n\\n# 设置环境变量\\nos.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'\\n\\n# 打印环境变量以确认设置成功\\nprint(os.environ.get('HF_ENDPOINT'))\\n\""
13
+ ]
14
+ },
15
+ "execution_count": 1,
16
+ "metadata": {},
17
+ "output_type": "execute_result"
18
+ }
19
+ ],
20
+ "source": [
21
+ "import subprocess\n",
22
+ "import os\n",
23
+ "\n",
24
+ "result = subprocess.run('bash -c \"source /etc/network_turbo && env | grep proxy\"', shell=True, capture_output=True, text=True)\n",
25
+ "output = result.stdout\n",
26
+ "for line in output.splitlines():\n",
27
+ " if '=' in line:\n",
28
+ " var, value = line.split('=', 1)\n",
29
+ " os.environ[var] = value\n",
30
+ "\n",
31
+ "\"\"\"\n",
32
+ "import os\n",
33
+ "\n",
34
+ "# 设置环境变量\n",
35
+ "os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'\n",
36
+ "\n",
37
+ "# 打印环境变量以确认设置成功\n",
38
+ "print(os.environ.get('HF_ENDPOINT'))\n",
39
+ "\"\"\""
40
+ ]
41
+ },
42
+ {
43
+ "cell_type": "code",
44
+ "execution_count": 2,
45
+ "id": "4aa15036-4be6-44ac-be6d-83389ae07c5d",
46
+ "metadata": {},
47
+ "outputs": [],
48
+ "source": [
49
+ "# from datasets import load_dataset\n",
50
+ "\n",
51
+ "# # 加载特定语言的 Wikipedia 语料\n",
52
+ "# langs = [\"20220301.en\", \"20220301.fr\", \"20220301.de\", \"20220301.zh\", \"20220301.ja\"]\n",
53
+ "# datasets = {lang: load_dataset(\"wikimedia/wikipedia\", lang, split=\"train\") for lang in langs}"
54
+ ]
55
+ },
56
+ {
57
+ "cell_type": "code",
58
+ "execution_count": 3,
59
+ "id": "02351414-fcad-44ba-964c-8ced0a88e609",
60
+ "metadata": {},
61
+ "outputs": [
62
+ {
63
+ "data": {
64
+ "application/vnd.jupyter.widget-view+json": {
65
+ "model_id": "408855cc406d47a3a47803d00829febd",
66
+ "version_major": 2,
67
+ "version_minor": 0
68
+ },
69
+ "text/plain": [
70
+ "Loading dataset shards: 0%| | 0/41 [00:00<?, ?it/s]"
71
+ ]
72
+ },
73
+ "metadata": {},
74
+ "output_type": "display_data"
75
+ },
76
+ {
77
+ "data": {
78
+ "text/plain": [
79
+ "DatasetDict({\n",
80
+ " train: Dataset({\n",
81
+ " features: ['id', 'url', 'title', 'text'],\n",
82
+ " num_rows: 6458670\n",
83
+ " })\n",
84
+ "})"
85
+ ]
86
+ },
87
+ "execution_count": 3,
88
+ "metadata": {},
89
+ "output_type": "execute_result"
90
+ }
91
+ ],
92
+ "source": [
93
+ "from datasets import load_dataset\n",
94
+ "datasets_en = load_dataset(\"legacy-datasets/wikipedia\", \"20220301.en\")\n",
95
+ "datasets_en"
96
+ ]
97
+ },
98
+ {
99
+ "cell_type": "code",
100
+ "execution_count": 4,
101
+ "id": "f00a2e59-0aed-446e-9c7b-d21233bcdd1c",
102
+ "metadata": {},
103
+ "outputs": [],
104
+ "source": [
105
+ "wiki = open(\"en_wiki.txt\",\"w\")\n",
106
+ "for item in datasets_en[\"train\"]:\n",
107
+ " line = item[\"title\"] + \" \"+ item[\"text\"] + \"\\n\"\n",
108
+ " wiki.write(line)\n",
109
+ "wiki.close()"
110
+ ]
111
+ },
112
+ {
113
+ "cell_type": "code",
114
+ "execution_count": 5,
115
+ "id": "ce2039da-ae06-49de-87da-bc452ee1f252",
116
+ "metadata": {},
117
+ "outputs": [
118
+ {
119
+ "data": {
120
+ "application/vnd.jupyter.widget-view+json": {
121
+ "model_id": "d240fe1bef53424e9ccbb8af29346f0c",
122
+ "version_major": 2,
123
+ "version_minor": 0
124
+ },
125
+ "text/plain": [
126
+ "train-00000-of-00015.parquet: 0%| | 0.00/764M [00:00<?, ?B/s]"
127
+ ]
128
+ },
129
+ "metadata": {},
130
+ "output_type": "display_data"
131
+ },
132
+ {
133
+ "data": {
134
+ "application/vnd.jupyter.widget-view+json": {
135
+ "model_id": "807912e6be304fcaa0786f2c863fa8ab",
136
+ "version_major": 2,
137
+ "version_minor": 0
138
+ },
139
+ "text/plain": [
140
+ "train-00001-of-00015.parquet: 0%| | 0.00/416M [00:00<?, ?B/s]"
141
+ ]
142
+ },
143
+ "metadata": {},
144
+ "output_type": "display_data"
145
+ },
146
+ {
147
+ "data": {
148
+ "application/vnd.jupyter.widget-view+json": {
149
+ "model_id": "ff2349e16ea44a1a9033f04096cf0fe4",
150
+ "version_major": 2,
151
+ "version_minor": 0
152
+ },
153
+ "text/plain": [
154
+ "train-00002-of-00015.parquet: 0%| | 0.00/342M [00:00<?, ?B/s]"
155
+ ]
156
+ },
157
+ "metadata": {},
158
+ "output_type": "display_data"
159
+ },
160
+ {
161
+ "data": {
162
+ "application/vnd.jupyter.widget-view+json": {
163
+ "model_id": "428705e16387469abc2f70d8f01032c8",
164
+ "version_major": 2,
165
+ "version_minor": 0
166
+ },
167
+ "text/plain": [
168
+ "train-00003-of-00015.parquet: 0%| | 0.00/306M [00:00<?, ?B/s]"
169
+ ]
170
+ },
171
+ "metadata": {},
172
+ "output_type": "display_data"
173
+ },
174
+ {
175
+ "data": {
176
+ "application/vnd.jupyter.widget-view+json": {
177
+ "model_id": "3edadedc0767441fa441450e541d0a41",
178
+ "version_major": 2,
179
+ "version_minor": 0
180
+ },
181
+ "text/plain": [
182
+ "train-00004-of-00015.parquet: 0%| | 0.00/281M [00:00<?, ?B/s]"
183
+ ]
184
+ },
185
+ "metadata": {},
186
+ "output_type": "display_data"
187
+ },
188
+ {
189
+ "data": {
190
+ "application/vnd.jupyter.widget-view+json": {
191
+ "model_id": "485c9747448e4fc792c748e3920d893c",
192
+ "version_major": 2,
193
+ "version_minor": 0
194
+ },
195
+ "text/plain": [
196
+ "train-00005-of-00015.parquet: 0%| | 0.00/261M [00:00<?, ?B/s]"
197
+ ]
198
+ },
199
+ "metadata": {},
200
+ "output_type": "display_data"
201
+ },
202
+ {
203
+ "data": {
204
+ "application/vnd.jupyter.widget-view+json": {
205
+ "model_id": "32acc84b51494535aff3fe9edf17d0f9",
206
+ "version_major": 2,
207
+ "version_minor": 0
208
+ },
209
+ "text/plain": [
210
+ "train-00006-of-00015.parquet: 0%| | 0.00/220M [00:00<?, ?B/s]"
211
+ ]
212
+ },
213
+ "metadata": {},
214
+ "output_type": "display_data"
215
+ },
216
+ {
217
+ "data": {
218
+ "application/vnd.jupyter.widget-view+json": {
219
+ "model_id": "1626f0e6390542298f9c22e659e95d52",
220
+ "version_major": 2,
221
+ "version_minor": 0
222
+ },
223
+ "text/plain": [
224
+ "train-00007-of-00015.parquet: 0%| | 0.00/210M [00:00<?, ?B/s]"
225
+ ]
226
+ },
227
+ "metadata": {},
228
+ "output_type": "display_data"
229
+ },
230
+ {
231
+ "data": {
232
+ "application/vnd.jupyter.widget-view+json": {
233
+ "model_id": "5c9d6d941a564add86f644daa4710850",
234
+ "version_major": 2,
235
+ "version_minor": 0
236
+ },
237
+ "text/plain": [
238
+ "train-00008-of-00015.parquet: 0%| | 0.00/215M [00:00<?, ?B/s]"
239
+ ]
240
+ },
241
+ "metadata": {},
242
+ "output_type": "display_data"
243
+ },
244
+ {
245
+ "data": {
246
+ "application/vnd.jupyter.widget-view+json": {
247
+ "model_id": "13b0fe00bca644d9b971793375c2bc3b",
248
+ "version_major": 2,
249
+ "version_minor": 0
250
+ },
251
+ "text/plain": [
252
+ "train-00009-of-00015.parquet: 0%| | 0.00/211M [00:00<?, ?B/s]"
253
+ ]
254
+ },
255
+ "metadata": {},
256
+ "output_type": "display_data"
257
+ },
258
+ {
259
+ "data": {
260
+ "application/vnd.jupyter.widget-view+json": {
261
+ "model_id": "60ff775f33b74bf6b9be369f4b38e116",
262
+ "version_major": 2,
263
+ "version_minor": 0
264
+ },
265
+ "text/plain": [
266
+ "train-00010-of-00015.parquet: 0%| | 0.00/181M [00:00<?, ?B/s]"
267
+ ]
268
+ },
269
+ "metadata": {},
270
+ "output_type": "display_data"
271
+ },
272
+ {
273
+ "data": {
274
+ "application/vnd.jupyter.widget-view+json": {
275
+ "model_id": "862390a8eff241b3b371e6db7404591a",
276
+ "version_major": 2,
277
+ "version_minor": 0
278
+ },
279
+ "text/plain": [
280
+ "train-00011-of-00015.parquet: 0%| | 0.00/197M [00:00<?, ?B/s]"
281
+ ]
282
+ },
283
+ "metadata": {},
284
+ "output_type": "display_data"
285
+ },
286
+ {
287
+ "data": {
288
+ "application/vnd.jupyter.widget-view+json": {
289
+ "model_id": "6162884ffd804cffb1f0d1e6a57e940f",
290
+ "version_major": 2,
291
+ "version_minor": 0
292
+ },
293
+ "text/plain": [
294
+ "train-00012-of-00015.parquet: 0%| | 0.00/180M [00:00<?, ?B/s]"
295
+ ]
296
+ },
297
+ "metadata": {},
298
+ "output_type": "display_data"
299
+ },
300
+ {
301
+ "data": {
302
+ "application/vnd.jupyter.widget-view+json": {
303
+ "model_id": "0d8e1f181e4043019e43704b685851f2",
304
+ "version_major": 2,
305
+ "version_minor": 0
306
+ },
307
+ "text/plain": [
308
+ "train-00013-of-00015.parquet: 0%| | 0.00/219M [00:00<?, ?B/s]"
309
+ ]
310
+ },
311
+ "metadata": {},
312
+ "output_type": "display_data"
313
+ },
314
+ {
315
+ "data": {
316
+ "application/vnd.jupyter.widget-view+json": {
317
+ "model_id": "fc7550b48dbe4d35b1c5e1c7ff52f0a8",
318
+ "version_major": 2,
319
+ "version_minor": 0
320
+ },
321
+ "text/plain": [
322
+ "train-00014-of-00015.parquet: 0%| | 0.00/220M [00:00<?, ?B/s]"
323
+ ]
324
+ },
325
+ "metadata": {},
326
+ "output_type": "display_data"
327
+ },
328
+ {
329
+ "data": {
330
+ "application/vnd.jupyter.widget-view+json": {
331
+ "model_id": "1d84178aa296430cb2e949c37fbde0e2",
332
+ "version_major": 2,
333
+ "version_minor": 0
334
+ },
335
+ "text/plain": [
336
+ "Generating train split: 0%| | 0/2402095 [00:00<?, ? examples/s]"
337
+ ]
338
+ },
339
+ "metadata": {},
340
+ "output_type": "display_data"
341
+ },
342
+ {
343
+ "data": {
344
+ "text/plain": [
345
+ "DatasetDict({\n",
346
+ " train: Dataset({\n",
347
+ " features: ['id', 'url', 'title', 'text'],\n",
348
+ " num_rows: 2402095\n",
349
+ " })\n",
350
+ "})"
351
+ ]
352
+ },
353
+ "execution_count": 5,
354
+ "metadata": {},
355
+ "output_type": "execute_result"
356
+ }
357
+ ],
358
+ "source": [
359
+ "datasets_wiki = load_dataset(\"legacy-datasets/wikipedia\", \"20220301.fr\")\n",
360
+ "datasets_wiki"
361
+ ]
362
+ },
363
+ {
364
+ "cell_type": "code",
365
+ "execution_count": 7,
366
+ "id": "ac610da1-9529-491a-a9f4-5d6f3a92cce7",
367
+ "metadata": {},
368
+ "outputs": [],
369
+ "source": [
370
+ "wiki = open(\"fr_wiki.txt\",\"w\")\n",
371
+ "for item in datasets_wiki[\"train\"]:\n",
372
+ " line = item[\"title\"] + \" \"+ item[\"text\"] + \"\\n\"\n",
373
+ " wiki.write(line)\n",
374
+ "wiki.close()"
375
+ ]
376
+ },
377
+ {
378
+ "cell_type": "code",
379
+ "execution_count": null,
380
+ "id": "8be4a828-affd-4b07-8821-16cbefb93bd6",
381
+ "metadata": {},
382
+ "outputs": [],
383
+ "source": []
384
+ },
385
+ {
386
+ "cell_type": "code",
387
+ "execution_count": 8,
388
+ "id": "2a4a8425-8615-45c5-84cf-fcfb6a54727a",
389
+ "metadata": {},
390
+ "outputs": [
391
+ {
392
+ "data": {
393
+ "application/vnd.jupyter.widget-view+json": {
394
+ "model_id": "1ade008c4100486e9e47a52fadc1ece7",
395
+ "version_major": 2,
396
+ "version_minor": 0
397
+ },
398
+ "text/plain": [
399
+ "Downloading data: 0%| | 0/18 [00:00<?, ?files/s]"
400
+ ]
401
+ },
402
+ "metadata": {},
403
+ "output_type": "display_data"
404
+ },
405
+ {
406
+ "data": {
407
+ "application/vnd.jupyter.widget-view+json": {
408
+ "model_id": "bbe51753a90240d399d26dfd0bdb1aa6",
409
+ "version_major": 2,
410
+ "version_minor": 0
411
+ },
412
+ "text/plain": [
413
+ "train-00010-of-00018.parquet: 0%| | 0.00/239M [00:00<?, ?B/s]"
414
+ ]
415
+ },
416
+ "metadata": {},
417
+ "output_type": "display_data"
418
+ },
419
+ {
420
+ "data": {
421
+ "application/vnd.jupyter.widget-view+json": {
422
+ "model_id": "717f471eccca49e5a415f8b8b8c9c6e3",
423
+ "version_major": 2,
424
+ "version_minor": 0
425
+ },
426
+ "text/plain": [
427
+ "train-00011-of-00018.parquet: 0%| | 0.00/252M [00:00<?, ?B/s]"
428
+ ]
429
+ },
430
+ "metadata": {},
431
+ "output_type": "display_data"
432
+ },
433
+ {
434
+ "data": {
435
+ "application/vnd.jupyter.widget-view+json": {
436
+ "model_id": "1ea2609792144411afa2a855e2ff1c7d",
437
+ "version_major": 2,
438
+ "version_minor": 0
439
+ },
440
+ "text/plain": [
441
+ "train-00005-of-00018.parquet: 0%| | 0.00/283M [00:00<?, ?B/s]"
442
+ ]
443
+ },
444
+ "metadata": {},
445
+ "output_type": "display_data"
446
+ },
447
+ {
448
+ "data": {
449
+ "application/vnd.jupyter.widget-view+json": {
450
+ "model_id": "a8c88a13fd5d475085649381fe2fbba7",
451
+ "version_major": 2,
452
+ "version_minor": 0
453
+ },
454
+ "text/plain": [
455
+ "train-00008-of-00018.parquet: 0%| | 0.00/241M [00:00<?, ?B/s]"
456
+ ]
457
+ },
458
+ "metadata": {},
459
+ "output_type": "display_data"
460
+ },
461
+ {
462
+ "data": {
463
+ "application/vnd.jupyter.widget-view+json": {
464
+ "model_id": "0b63f5caf6c54225b35a3bdc915cc73e",
465
+ "version_major": 2,
466
+ "version_minor": 0
467
+ },
468
+ "text/plain": [
469
+ "train-00000-of-00018.parquet: 0%| | 0.00/773M [00:00<?, ?B/s]"
470
+ ]
471
+ },
472
+ "metadata": {},
473
+ "output_type": "display_data"
474
+ },
475
+ {
476
+ "data": {
477
+ "application/vnd.jupyter.widget-view+json": {
478
+ "model_id": "de0f6e9973ab427eaa937924b1134029",
479
+ "version_major": 2,
480
+ "version_minor": 0
481
+ },
482
+ "text/plain": [
483
+ "train-00001-of-00018.parquet: 0%| | 0.00/446M [00:00<?, ?B/s]"
484
+ ]
485
+ },
486
+ "metadata": {},
487
+ "output_type": "display_data"
488
+ },
489
+ {
490
+ "data": {
491
+ "application/vnd.jupyter.widget-view+json": {
492
+ "model_id": "beac8579da8244deadc2da23aec6ec6e",
493
+ "version_major": 2,
494
+ "version_minor": 0
495
+ },
496
+ "text/plain": [
497
+ "train-00015-of-00018.parquet: 0%| | 0.00/224M [00:00<?, ?B/s]"
498
+ ]
499
+ },
500
+ "metadata": {},
501
+ "output_type": "display_data"
502
+ },
503
+ {
504
+ "data": {
505
+ "application/vnd.jupyter.widget-view+json": {
506
+ "model_id": "293b420636b44bab986752b92c91a1b3",
507
+ "version_major": 2,
508
+ "version_minor": 0
509
+ },
510
+ "text/plain": [
511
+ "train-00009-of-00018.parquet: 0%| | 0.00/236M [00:00<?, ?B/s]"
512
+ ]
513
+ },
514
+ "metadata": {},
515
+ "output_type": "display_data"
516
+ },
517
+ {
518
+ "data": {
519
+ "application/vnd.jupyter.widget-view+json": {
520
+ "model_id": "d3d66b5d72ac472aa3ba20dae34f9695",
521
+ "version_major": 2,
522
+ "version_minor": 0
523
+ },
524
+ "text/plain": [
525
+ "train-00012-of-00018.parquet: 0%| | 0.00/245M [00:00<?, ?B/s]"
526
+ ]
527
+ },
528
+ "metadata": {},
529
+ "output_type": "display_data"
530
+ },
531
+ {
532
+ "data": {
533
+ "application/vnd.jupyter.widget-view+json": {
534
+ "model_id": "3be8b392a6614472b005ba9b9b1e1d59",
535
+ "version_major": 2,
536
+ "version_minor": 0
537
+ },
538
+ "text/plain": [
539
+ "train-00006-of-00018.parquet: 0%| | 0.00/277M [00:00<?, ?B/s]"
540
+ ]
541
+ },
542
+ "metadata": {},
543
+ "output_type": "display_data"
544
+ },
545
+ {
546
+ "data": {
547
+ "application/vnd.jupyter.widget-view+json": {
548
+ "model_id": "b371f42b410e4cbf9137666b4bac2647",
549
+ "version_major": 2,
550
+ "version_minor": 0
551
+ },
552
+ "text/plain": [
553
+ "train-00007-of-00018.parquet: 0%| | 0.00/256M [00:00<?, ?B/s]"
554
+ ]
555
+ },
556
+ "metadata": {},
557
+ "output_type": "display_data"
558
+ },
559
+ {
560
+ "data": {
561
+ "application/vnd.jupyter.widget-view+json": {
562
+ "model_id": "a9ab370c0f534b289fdac4ed3cddff3a",
563
+ "version_major": 2,
564
+ "version_minor": 0
565
+ },
566
+ "text/plain": [
567
+ "train-00002-of-00018.parquet: 0%| | 0.00/369M [00:00<?, ?B/s]"
568
+ ]
569
+ },
570
+ "metadata": {},
571
+ "output_type": "display_data"
572
+ },
573
+ {
574
+ "data": {
575
+ "application/vnd.jupyter.widget-view+json": {
576
+ "model_id": "dce437fe46e04c82b2f8ab7a3121592b",
577
+ "version_major": 2,
578
+ "version_minor": 0
579
+ },
580
+ "text/plain": [
581
+ "train-00003-of-00018.parquet: 0%| | 0.00/291M [00:00<?, ?B/s]"
582
+ ]
583
+ },
584
+ "metadata": {},
585
+ "output_type": "display_data"
586
+ },
587
+ {
588
+ "data": {
589
+ "application/vnd.jupyter.widget-view+json": {
590
+ "model_id": "5bec466db34b4daa926a6eef48273817",
591
+ "version_major": 2,
592
+ "version_minor": 0
593
+ },
594
+ "text/plain": [
595
+ "train-00014-of-00018.parquet: 0%| | 0.00/233M [00:00<?, ?B/s]"
596
+ ]
597
+ },
598
+ "metadata": {},
599
+ "output_type": "display_data"
600
+ },
601
+ {
602
+ "data": {
603
+ "application/vnd.jupyter.widget-view+json": {
604
+ "model_id": "0e47f796b69749ef8f3bf255300553ec",
605
+ "version_major": 2,
606
+ "version_minor": 0
607
+ },
608
+ "text/plain": [
609
+ "train-00004-of-00018.parquet: 0%| | 0.00/300M [00:00<?, ?B/s]"
610
+ ]
611
+ },
612
+ "metadata": {},
613
+ "output_type": "display_data"
614
+ },
615
+ {
616
+ "data": {
617
+ "application/vnd.jupyter.widget-view+json": {
618
+ "model_id": "b6dd9bf441d24ffa886996c210b4607f",
619
+ "version_major": 2,
620
+ "version_minor": 0
621
+ },
622
+ "text/plain": [
623
+ "train-00013-of-00018.parquet: 0%| | 0.00/233M [00:00<?, ?B/s]"
624
+ ]
625
+ },
626
+ "metadata": {},
627
+ "output_type": "display_data"
628
+ },
629
+ {
630
+ "data": {
631
+ "application/vnd.jupyter.widget-view+json": {
632
+ "model_id": "0fa5351d1be542b88a3024b900b0ef19",
633
+ "version_major": 2,
634
+ "version_minor": 0
635
+ },
636
+ "text/plain": [
637
+ "train-00016-of-00018.parquet: 0%| | 0.00/224M [00:00<?, ?B/s]"
638
+ ]
639
+ },
640
+ "metadata": {},
641
+ "output_type": "display_data"
642
+ },
643
+ {
644
+ "data": {
645
+ "application/vnd.jupyter.widget-view+json": {
646
+ "model_id": "1452efc3d5564ef99ae336dbcc36b22f",
647
+ "version_major": 2,
648
+ "version_minor": 0
649
+ },
650
+ "text/plain": [
651
+ "train-00017-of-00018.parquet: 0%| | 0.00/222M [00:00<?, ?B/s]"
652
+ ]
653
+ },
654
+ "metadata": {},
655
+ "output_type": "display_data"
656
+ },
657
+ {
658
+ "data": {
659
+ "application/vnd.jupyter.widget-view+json": {
660
+ "model_id": "c12fa5a2d21241c6a31b4c889999ad0a",
661
+ "version_major": 2,
662
+ "version_minor": 0
663
+ },
664
+ "text/plain": [
665
+ "Generating train split: 0%| | 0/2665357 [00:00<?, ? examples/s]"
666
+ ]
667
+ },
668
+ "metadata": {},
669
+ "output_type": "display_data"
670
+ },
671
+ {
672
+ "data": {
673
+ "application/vnd.jupyter.widget-view+json": {
674
+ "model_id": "ca6b84a24ecf4e00bf8e550ba5eb5054",
675
+ "version_major": 2,
676
+ "version_minor": 0
677
+ },
678
+ "text/plain": [
679
+ "Loading dataset shards: 0%| | 0/18 [00:00<?, ?it/s]"
680
+ ]
681
+ },
682
+ "metadata": {},
683
+ "output_type": "display_data"
684
+ },
685
+ {
686
+ "data": {
687
+ "text/plain": [
688
+ "DatasetDict({\n",
689
+ " train: Dataset({\n",
690
+ " features: ['id', 'url', 'title', 'text'],\n",
691
+ " num_rows: 2665357\n",
692
+ " })\n",
693
+ "})"
694
+ ]
695
+ },
696
+ "execution_count": 8,
697
+ "metadata": {},
698
+ "output_type": "execute_result"
699
+ }
700
+ ],
701
+ "source": [
702
+ "datasets_wiki = load_dataset(\"legacy-datasets/wikipedia\", \"20220301.de\")\n",
703
+ "datasets_wiki"
704
+ ]
705
+ },
706
+ {
707
+ "cell_type": "code",
708
+ "execution_count": 9,
709
+ "id": "1c60e53d-3031-44e7-b1da-edc419079f13",
710
+ "metadata": {},
711
+ "outputs": [],
712
+ "source": [
713
+ "wiki = open(\"de_wiki.txt\",\"w\")\n",
714
+ "for item in datasets_wiki[\"train\"]:\n",
715
+ " line = item[\"title\"] + \" \"+ item[\"text\"] + \"\\n\"\n",
716
+ " wiki.write(line)\n",
717
+ "wiki.close()"
718
+ ]
719
+ },
720
+ {
721
+ "cell_type": "code",
722
+ "execution_count": null,
723
+ "id": "c42100db-79d9-4cbe-9582-e43d58567173",
724
+ "metadata": {},
725
+ "outputs": [],
726
+ "source": []
727
+ },
728
+ {
729
+ "cell_type": "code",
730
+ "execution_count": 12,
731
+ "id": "eee3698d-ccb8-4cec-9796-e93886fd6b1e",
732
+ "metadata": {},
733
+ "outputs": [
734
+ {
735
+ "data": {
736
+ "application/vnd.jupyter.widget-view+json": {
737
+ "model_id": "a03b4c97410645eaaae45f25e1c03108",
738
+ "version_major": 2,
739
+ "version_minor": 0
740
+ },
741
+ "text/plain": [
742
+ "train-00000-of-00015.parquet: 0%| | 0.00/612M [00:00<?, ?B/s]"
743
+ ]
744
+ },
745
+ "metadata": {},
746
+ "output_type": "display_data"
747
+ },
748
+ {
749
+ "data": {
750
+ "application/vnd.jupyter.widget-view+json": {
751
+ "model_id": "5a03f25cd8354e3299b82554c0fc3287",
752
+ "version_major": 2,
753
+ "version_minor": 0
754
+ },
755
+ "text/plain": [
756
+ "train-00001-of-00015.parquet: 0%| | 0.00/383M [00:00<?, ?B/s]"
757
+ ]
758
+ },
759
+ "metadata": {},
760
+ "output_type": "display_data"
761
+ },
762
+ {
763
+ "data": {
764
+ "application/vnd.jupyter.widget-view+json": {
765
+ "model_id": "8a63110f3cf648eda442ee3177aee2f6",
766
+ "version_major": 2,
767
+ "version_minor": 0
768
+ },
769
+ "text/plain": [
770
+ "train-00002-of-00015.parquet: 0%| | 0.00/305M [00:00<?, ?B/s]"
771
+ ]
772
+ },
773
+ "metadata": {},
774
+ "output_type": "display_data"
775
+ },
776
+ {
777
+ "data": {
778
+ "application/vnd.jupyter.widget-view+json": {
779
+ "model_id": "09ec9ce700084abcb944f6b6669ebf15",
780
+ "version_major": 2,
781
+ "version_minor": 0
782
+ },
783
+ "text/plain": [
784
+ "train-00003-of-00015.parquet: 0%| | 0.00/252M [00:00<?, ?B/s]"
785
+ ]
786
+ },
787
+ "metadata": {},
788
+ "output_type": "display_data"
789
+ },
790
+ {
791
+ "data": {
792
+ "application/vnd.jupyter.widget-view+json": {
793
+ "model_id": "363a3b6272c24e23a0e708864c872310",
794
+ "version_major": 2,
795
+ "version_minor": 0
796
+ },
797
+ "text/plain": [
798
+ "train-00004-of-00015.parquet: 0%| | 0.00/240M [00:00<?, ?B/s]"
799
+ ]
800
+ },
801
+ "metadata": {},
802
+ "output_type": "display_data"
803
+ },
804
+ {
805
+ "data": {
806
+ "application/vnd.jupyter.widget-view+json": {
807
+ "model_id": "58b7f596d96246439e7cc21acc333fb4",
808
+ "version_major": 2,
809
+ "version_minor": 0
810
+ },
811
+ "text/plain": [
812
+ "train-00005-of-00015.parquet: 0%| | 0.00/240M [00:00<?, ?B/s]"
813
+ ]
814
+ },
815
+ "metadata": {},
816
+ "output_type": "display_data"
817
+ },
818
+ {
819
+ "data": {
820
+ "application/vnd.jupyter.widget-view+json": {
821
+ "model_id": "2872cc14f4944b4894bd05bbb256d892",
822
+ "version_major": 2,
823
+ "version_minor": 0
824
+ },
825
+ "text/plain": [
826
+ "train-00006-of-00015.parquet: 0%| | 0.00/233M [00:00<?, ?B/s]"
827
+ ]
828
+ },
829
+ "metadata": {},
830
+ "output_type": "display_data"
831
+ },
832
+ {
833
+ "data": {
834
+ "application/vnd.jupyter.widget-view+json": {
835
+ "model_id": "3d65c3b76a02445e85d6723ea19565c2",
836
+ "version_major": 2,
837
+ "version_minor": 0
838
+ },
839
+ "text/plain": [
840
+ "train-00007-of-00015.parquet: 0%| | 0.00/218M [00:00<?, ?B/s]"
841
+ ]
842
+ },
843
+ "metadata": {},
844
+ "output_type": "display_data"
845
+ },
846
+ {
847
+ "data": {
848
+ "application/vnd.jupyter.widget-view+json": {
849
+ "model_id": "a77dd5822bd34896b4dbd4d22cb8331c",
850
+ "version_major": 2,
851
+ "version_minor": 0
852
+ },
853
+ "text/plain": [
854
+ "train-00008-of-00015.parquet: 0%| | 0.00/224M [00:00<?, ?B/s]"
855
+ ]
856
+ },
857
+ "metadata": {},
858
+ "output_type": "display_data"
859
+ },
860
+ {
861
+ "data": {
862
+ "application/vnd.jupyter.widget-view+json": {
863
+ "model_id": "52ce12211c8743acac275af2c7ac049c",
864
+ "version_major": 2,
865
+ "version_minor": 0
866
+ },
867
+ "text/plain": [
868
+ "train-00009-of-00015.parquet: 0%| | 0.00/235M [00:00<?, ?B/s]"
869
+ ]
870
+ },
871
+ "metadata": {},
872
+ "output_type": "display_data"
873
+ },
874
+ {
875
+ "data": {
876
+ "application/vnd.jupyter.widget-view+json": {
877
+ "model_id": "9da3788934f14b15a63e4add38e989b4",
878
+ "version_major": 2,
879
+ "version_minor": 0
880
+ },
881
+ "text/plain": [
882
+ "train-00010-of-00015.parquet: 0%| | 0.00/217M [00:00<?, ?B/s]"
883
+ ]
884
+ },
885
+ "metadata": {},
886
+ "output_type": "display_data"
887
+ },
888
+ {
889
+ "data": {
890
+ "application/vnd.jupyter.widget-view+json": {
891
+ "model_id": "06254bec73a24830b685468235260f0c",
892
+ "version_major": 2,
893
+ "version_minor": 0
894
+ },
895
+ "text/plain": [
896
+ "train-00011-of-00015.parquet: 0%| | 0.00/213M [00:00<?, ?B/s]"
897
+ ]
898
+ },
899
+ "metadata": {},
900
+ "output_type": "display_data"
901
+ },
902
+ {
903
+ "data": {
904
+ "application/vnd.jupyter.widget-view+json": {
905
+ "model_id": "d25a6d36b72446458525dd060d63102a",
906
+ "version_major": 2,
907
+ "version_minor": 0
908
+ },
909
+ "text/plain": [
910
+ "train-00012-of-00015.parquet: 0%| | 0.00/201M [00:00<?, ?B/s]"
911
+ ]
912
+ },
913
+ "metadata": {},
914
+ "output_type": "display_data"
915
+ },
916
+ {
917
+ "data": {
918
+ "application/vnd.jupyter.widget-view+json": {
919
+ "model_id": "4da023af77894a72a1e6223c91820983",
920
+ "version_major": 2,
921
+ "version_minor": 0
922
+ },
923
+ "text/plain": [
924
+ "train-00013-of-00015.parquet: 0%| | 0.00/193M [00:00<?, ?B/s]"
925
+ ]
926
+ },
927
+ "metadata": {},
928
+ "output_type": "display_data"
929
+ },
930
+ {
931
+ "data": {
932
+ "application/vnd.jupyter.widget-view+json": {
933
+ "model_id": "6c6d63c3e4df46c69da6b42c8b7eae4a",
934
+ "version_major": 2,
935
+ "version_minor": 0
936
+ },
937
+ "text/plain": [
938
+ "train-00014-of-00015.parquet: 0%| | 0.00/176M [00:00<?, ?B/s]"
939
+ ]
940
+ },
941
+ "metadata": {},
942
+ "output_type": "display_data"
943
+ },
944
+ {
945
+ "data": {
946
+ "application/vnd.jupyter.widget-view+json": {
947
+ "model_id": "f98c43bad8a140cca38d7c00232f884a",
948
+ "version_major": 2,
949
+ "version_minor": 0
950
+ },
951
+ "text/plain": [
952
+ "Generating train split: 0%| | 0/1389467 [00:00<?, ? examples/s]"
953
+ ]
954
+ },
955
+ "metadata": {},
956
+ "output_type": "display_data"
957
+ },
958
+ {
959
+ "data": {
960
+ "text/plain": [
961
+ "DatasetDict({\n",
962
+ " train: Dataset({\n",
963
+ " features: ['id', 'url', 'title', 'text'],\n",
964
+ " num_rows: 1389467\n",
965
+ " })\n",
966
+ "})"
967
+ ]
968
+ },
969
+ "execution_count": 12,
970
+ "metadata": {},
971
+ "output_type": "execute_result"
972
+ }
973
+ ],
974
+ "source": [
975
+ "datasets_wiki = load_dataset(\"wikimedia/wikipedia\", \"20231101.ja\")\n",
976
+ "datasets_wiki"
977
+ ]
978
+ },
979
+ {
980
+ "cell_type": "code",
981
+ "execution_count": 14,
982
+ "id": "4b61a5b0-85ca-4a0e-99b0-0ea2e3bc9e82",
983
+ "metadata": {},
984
+ "outputs": [],
985
+ "source": [
986
+ "wiki = open(\"ja_wiki.txt\",\"w\")\n",
987
+ "for item in datasets_wiki[\"train\"]:\n",
988
+ " line = item[\"title\"] + \" \"+ item[\"text\"] + \"\\n\"\n",
989
+ " wiki.write(line)\n",
990
+ "wiki.close()"
991
+ ]
992
+ },
993
+ {
994
+ "cell_type": "code",
995
+ "execution_count": null,
996
+ "id": "5c56f7fd-8188-4d30-928d-e307764f23f9",
997
+ "metadata": {},
998
+ "outputs": [],
999
+ "source": []
1000
+ },
1001
+ {
1002
+ "cell_type": "code",
1003
+ "execution_count": 15,
1004
+ "id": "1668c12c-cbfe-4ea8-82c6-4e27607e9b67",
1005
+ "metadata": {},
1006
+ "outputs": [
1007
+ {
1008
+ "data": {
1009
+ "application/vnd.jupyter.widget-view+json": {
1010
+ "model_id": "8f8cd7bb3992440c89087f809b96832d",
1011
+ "version_major": 2,
1012
+ "version_minor": 0
1013
+ },
1014
+ "text/plain": [
1015
+ "train-00000-of-00013.parquet: 0%| | 0.00/688M [00:00<?, ?B/s]"
1016
+ ]
1017
+ },
1018
+ "metadata": {},
1019
+ "output_type": "display_data"
1020
+ },
1021
+ {
1022
+ "data": {
1023
+ "application/vnd.jupyter.widget-view+json": {
1024
+ "model_id": "b6c6f856ff3f4070a26e56bfacca6deb",
1025
+ "version_major": 2,
1026
+ "version_minor": 0
1027
+ },
1028
+ "text/plain": [
1029
+ "train-00001-of-00013.parquet: 0%| | 0.00/376M [00:00<?, ?B/s]"
1030
+ ]
1031
+ },
1032
+ "metadata": {},
1033
+ "output_type": "display_data"
1034
+ },
1035
+ {
1036
+ "data": {
1037
+ "application/vnd.jupyter.widget-view+json": {
1038
+ "model_id": "d90dc9bb27ef418e906109adf3e7fad3",
1039
+ "version_major": 2,
1040
+ "version_minor": 0
1041
+ },
1042
+ "text/plain": [
1043
+ "train-00002-of-00013.parquet: 0%| | 0.00/287M [00:00<?, ?B/s]"
1044
+ ]
1045
+ },
1046
+ "metadata": {},
1047
+ "output_type": "display_data"
1048
+ },
1049
+ {
1050
+ "data": {
1051
+ "application/vnd.jupyter.widget-view+json": {
1052
+ "model_id": "e72006a2edd24c3a9bd88a0e2ac57d7b",
1053
+ "version_major": 2,
1054
+ "version_minor": 0
1055
+ },
1056
+ "text/plain": [
1057
+ "train-00003-of-00013.parquet: 0%| | 0.00/245M [00:00<?, ?B/s]"
1058
+ ]
1059
+ },
1060
+ "metadata": {},
1061
+ "output_type": "display_data"
1062
+ },
1063
+ {
1064
+ "data": {
1065
+ "application/vnd.jupyter.widget-view+json": {
1066
+ "model_id": "df667cedb5374cd39e86e52c6cea3a7a",
1067
+ "version_major": 2,
1068
+ "version_minor": 0
1069
+ },
1070
+ "text/plain": [
1071
+ "train-00004-of-00013.parquet: 0%| | 0.00/168M [00:00<?, ?B/s]"
1072
+ ]
1073
+ },
1074
+ "metadata": {},
1075
+ "output_type": "display_data"
1076
+ },
1077
+ {
1078
+ "data": {
1079
+ "application/vnd.jupyter.widget-view+json": {
1080
+ "model_id": "6b005307539743769e0ac974c2996d9b",
1081
+ "version_major": 2,
1082
+ "version_minor": 0
1083
+ },
1084
+ "text/plain": [
1085
+ "train-00005-of-00013.parquet: 0%| | 0.00/178M [00:00<?, ?B/s]"
1086
+ ]
1087
+ },
1088
+ "metadata": {},
1089
+ "output_type": "display_data"
1090
+ },
1091
+ {
1092
+ "data": {
1093
+ "application/vnd.jupyter.widget-view+json": {
1094
+ "model_id": "c749d7a330f24d2fa684857be0700cc3",
1095
+ "version_major": 2,
1096
+ "version_minor": 0
1097
+ },
1098
+ "text/plain": [
1099
+ "train-00006-of-00013.parquet: 0%| | 0.00/216M [00:00<?, ?B/s]"
1100
+ ]
1101
+ },
1102
+ "metadata": {},
1103
+ "output_type": "display_data"
1104
+ },
1105
+ {
1106
+ "data": {
1107
+ "application/vnd.jupyter.widget-view+json": {
1108
+ "model_id": "7b07061ea4694ba8af395eeae31bd1fd",
1109
+ "version_major": 2,
1110
+ "version_minor": 0
1111
+ },
1112
+ "text/plain": [
1113
+ "train-00007-of-00013.parquet: 0%| | 0.00/241M [00:00<?, ?B/s]"
1114
+ ]
1115
+ },
1116
+ "metadata": {},
1117
+ "output_type": "display_data"
1118
+ },
1119
+ {
1120
+ "data": {
1121
+ "application/vnd.jupyter.widget-view+json": {
1122
+ "model_id": "0e07a225a3f24d9a9c92ad9ee153b066",
1123
+ "version_major": 2,
1124
+ "version_minor": 0
1125
+ },
1126
+ "text/plain": [
1127
+ "train-00008-of-00013.parquet: 0%| | 0.00/227M [00:00<?, ?B/s]"
1128
+ ]
1129
+ },
1130
+ "metadata": {},
1131
+ "output_type": "display_data"
1132
+ },
1133
+ {
1134
+ "data": {
1135
+ "application/vnd.jupyter.widget-view+json": {
1136
+ "model_id": "3054fd1470a34bbab1c6dee28dcb2441",
1137
+ "version_major": 2,
1138
+ "version_minor": 0
1139
+ },
1140
+ "text/plain": [
1141
+ "train-00009-of-00013.parquet: 0%| | 0.00/223M [00:00<?, ?B/s]"
1142
+ ]
1143
+ },
1144
+ "metadata": {},
1145
+ "output_type": "display_data"
1146
+ },
1147
+ {
1148
+ "data": {
1149
+ "application/vnd.jupyter.widget-view+json": {
1150
+ "model_id": "90f7a09fb95b4c4a926711a258c32107",
1151
+ "version_major": 2,
1152
+ "version_minor": 0
1153
+ },
1154
+ "text/plain": [
1155
+ "train-00010-of-00013.parquet: 0%| | 0.00/167M [00:00<?, ?B/s]"
1156
+ ]
1157
+ },
1158
+ "metadata": {},
1159
+ "output_type": "display_data"
1160
+ },
1161
+ {
1162
+ "data": {
1163
+ "application/vnd.jupyter.widget-view+json": {
1164
+ "model_id": "f2ceb836cfca47e2ba75659dacfc9739",
1165
+ "version_major": 2,
1166
+ "version_minor": 0
1167
+ },
1168
+ "text/plain": [
1169
+ "train-00011-of-00013.parquet: 0%| | 0.00/254M [00:00<?, ?B/s]"
1170
+ ]
1171
+ },
1172
+ "metadata": {},
1173
+ "output_type": "display_data"
1174
+ },
1175
+ {
1176
+ "data": {
1177
+ "application/vnd.jupyter.widget-view+json": {
1178
+ "model_id": "b20a7cea4fb146a5810cde198ef60ab6",
1179
+ "version_major": 2,
1180
+ "version_minor": 0
1181
+ },
1182
+ "text/plain": [
1183
+ "train-00012-of-00013.parquet: 0%| | 0.00/226M [00:00<?, ?B/s]"
1184
+ ]
1185
+ },
1186
+ "metadata": {},
1187
+ "output_type": "display_data"
1188
+ },
1189
+ {
1190
+ "data": {
1191
+ "application/vnd.jupyter.widget-view+json": {
1192
+ "model_id": "ba34e3fba4be480cbf145281eb834d64",
1193
+ "version_major": 2,
1194
+ "version_minor": 0
1195
+ },
1196
+ "text/plain": [
1197
+ "Generating train split: 0%| | 0/1841155 [00:00<?, ? examples/s]"
1198
+ ]
1199
+ },
1200
+ "metadata": {},
1201
+ "output_type": "display_data"
1202
+ },
1203
+ {
1204
+ "data": {
1205
+ "text/plain": [
1206
+ "DatasetDict({\n",
1207
+ " train: Dataset({\n",
1208
+ " features: ['id', 'url', 'title', 'text'],\n",
1209
+ " num_rows: 1841155\n",
1210
+ " })\n",
1211
+ "})"
1212
+ ]
1213
+ },
1214
+ "execution_count": 15,
1215
+ "metadata": {},
1216
+ "output_type": "execute_result"
1217
+ }
1218
+ ],
1219
+ "source": [
1220
+ "datasets_wiki = load_dataset(\"wikimedia/wikipedia\", \"20231101.es\")\n",
1221
+ "datasets_wiki"
1222
+ ]
1223
+ },
1224
+ {
1225
+ "cell_type": "code",
1226
+ "execution_count": 16,
1227
+ "id": "9b3d2f1d-47c3-4a4d-8f60-cf5874ec643b",
1228
+ "metadata": {},
1229
+ "outputs": [],
1230
+ "source": [
1231
+ "wiki = open(\"es_wiki.txt\",\"w\")\n",
1232
+ "for item in datasets_wiki[\"train\"]:\n",
1233
+ " line = item[\"title\"] + \" \"+ item[\"text\"] + \"\\n\"\n",
1234
+ " wiki.write(line)\n",
1235
+ "wiki.close()"
1236
+ ]
1237
+ },
1238
+ {
1239
+ "cell_type": "code",
1240
+ "execution_count": null,
1241
+ "id": "4d300900-2d36-4f21-a799-17b466239666",
1242
+ "metadata": {},
1243
+ "outputs": [],
1244
+ "source": []
1245
+ },
1246
+ {
1247
+ "cell_type": "code",
1248
+ "execution_count": 17,
1249
+ "id": "1d20d238-3015-443f-956a-ca1a7c622001",
1250
+ "metadata": {},
1251
+ "outputs": [
1252
+ {
1253
+ "data": {
1254
+ "text/plain": [
1255
+ "DatasetDict({\n",
1256
+ " train: Dataset({\n",
1257
+ " features: ['id', 'url', 'title', 'text'],\n",
1258
+ " num_rows: 1384748\n",
1259
+ " })\n",
1260
+ "})"
1261
+ ]
1262
+ },
1263
+ "execution_count": 17,
1264
+ "metadata": {},
1265
+ "output_type": "execute_result"
1266
+ }
1267
+ ],
1268
+ "source": [
1269
+ "datasets_wiki = load_dataset(\"wikimedia/wikipedia\", \"20231101.zh\")\n",
1270
+ "datasets_wiki"
1271
+ ]
1272
+ },
1273
+ {
1274
+ "cell_type": "code",
1275
+ "execution_count": 18,
1276
+ "id": "eac16b65-44d1-43ae-9aeb-c06a470daf0e",
1277
+ "metadata": {},
1278
+ "outputs": [],
1279
+ "source": [
1280
+ "wiki = open(\"zh_wiki.txt\",\"w\")\n",
1281
+ "for item in datasets_wiki[\"train\"]:\n",
1282
+ " line = item[\"title\"] + \" \"+ item[\"text\"] + \"\\n\"\n",
1283
+ " wiki.write(line)\n",
1284
+ "wiki.close()"
1285
+ ]
1286
+ },
1287
+ {
1288
+ "cell_type": "code",
1289
+ "execution_count": null,
1290
+ "id": "67c13fd3-ce11-41ce-8ca9-7d1d783310fb",
1291
+ "metadata": {},
1292
+ "outputs": [],
1293
+ "source": []
1294
+ },
1295
+ {
1296
+ "cell_type": "code",
1297
+ "execution_count": 19,
1298
+ "id": "8d4f223d-accc-44f5-acf3-5eca7621e960",
1299
+ "metadata": {},
1300
+ "outputs": [
1301
+ {
1302
+ "data": {
1303
+ "application/vnd.jupyter.widget-view+json": {
1304
+ "model_id": "2b39607ab45b4449b96d05858de65c90",
1305
+ "version_major": 2,
1306
+ "version_minor": 0
1307
+ },
1308
+ "text/plain": [
1309
+ "train-00000-of-00003.parquet: 0%| | 0.00/400M [00:00<?, ?B/s]"
1310
+ ]
1311
+ },
1312
+ "metadata": {},
1313
+ "output_type": "display_data"
1314
+ },
1315
+ {
1316
+ "data": {
1317
+ "application/vnd.jupyter.widget-view+json": {
1318
+ "model_id": "c930ca1ff58a4ba2995158c1944335c2",
1319
+ "version_major": 2,
1320
+ "version_minor": 0
1321
+ },
1322
+ "text/plain": [
1323
+ "train-00001-of-00003.parquet: 0%| | 0.00/205M [00:00<?, ?B/s]"
1324
+ ]
1325
+ },
1326
+ "metadata": {},
1327
+ "output_type": "display_data"
1328
+ },
1329
+ {
1330
+ "data": {
1331
+ "application/vnd.jupyter.widget-view+json": {
1332
+ "model_id": "37654989816342af9a355a2cb6e770b4",
1333
+ "version_major": 2,
1334
+ "version_minor": 0
1335
+ },
1336
+ "text/plain": [
1337
+ "train-00002-of-00003.parquet: 0%| | 0.00/177M [00:00<?, ?B/s]"
1338
+ ]
1339
+ },
1340
+ "metadata": {},
1341
+ "output_type": "display_data"
1342
+ },
1343
+ {
1344
+ "data": {
1345
+ "application/vnd.jupyter.widget-view+json": {
1346
+ "model_id": "b0dbd13bc9dc457eae92438fd63947ae",
1347
+ "version_major": 2,
1348
+ "version_minor": 0
1349
+ },
1350
+ "text/plain": [
1351
+ "Generating train split: 0%| | 0/647897 [00:00<?, ? examples/s]"
1352
+ ]
1353
+ },
1354
+ "metadata": {},
1355
+ "output_type": "display_data"
1356
+ },
1357
+ {
1358
+ "data": {
1359
+ "text/plain": [
1360
+ "DatasetDict({\n",
1361
+ " train: Dataset({\n",
1362
+ " features: ['id', 'url', 'title', 'text'],\n",
1363
+ " num_rows: 647897\n",
1364
+ " })\n",
1365
+ "})"
1366
+ ]
1367
+ },
1368
+ "execution_count": 19,
1369
+ "metadata": {},
1370
+ "output_type": "execute_result"
1371
+ }
1372
+ ],
1373
+ "source": [
1374
+ "datasets_wiki = load_dataset(\"wikimedia/wikipedia\", \"20231101.ko\")\n",
1375
+ "datasets_wiki"
1376
+ ]
1377
+ },
1378
+ {
1379
+ "cell_type": "code",
1380
+ "execution_count": 21,
1381
+ "id": "1af1e848-559a-4882-a3ce-966a63edac44",
1382
+ "metadata": {},
1383
+ "outputs": [],
1384
+ "source": [
1385
+ "wiki = open(\"ko_wiki.txt\",\"w\")\n",
1386
+ "for item in datasets_wiki[\"train\"]:\n",
1387
+ " line = item[\"title\"] + \" \"+ item[\"text\"] + \"\\n\"\n",
1388
+ " wiki.write(line)\n",
1389
+ "wiki.close()"
1390
+ ]
1391
+ },
1392
+ {
1393
+ "cell_type": "code",
1394
+ "execution_count": null,
1395
+ "id": "e4b3810a-f564-4a4e-8ccc-5f23f8541414",
1396
+ "metadata": {},
1397
+ "outputs": [],
1398
+ "source": []
1399
+ }
1400
+ ],
1401
+ "metadata": {
1402
+ "kernelspec": {
1403
+ "display_name": "Python 3 (ipykernel)",
1404
+ "language": "python",
1405
+ "name": "python3"
1406
+ },
1407
+ "language_info": {
1408
+ "codemirror_mode": {
1409
+ "name": "ipython",
1410
+ "version": 3
1411
+ },
1412
+ "file_extension": ".py",
1413
+ "mimetype": "text/x-python",
1414
+ "name": "python",
1415
+ "nbconvert_exporter": "python",
1416
+ "pygments_lexer": "ipython3",
1417
+ "version": "3.12.3"
1418
+ }
1419
+ },
1420
+ "nbformat": 4,
1421
+ "nbformat_minor": 5
1422
+ }
get_data/get_wiki1.ipynb ADDED
@@ -0,0 +1,1422 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "id": "163a0ec0-2003-40f8-bf8a-63c1146f9130",
7
+ "metadata": {},
8
+ "outputs": [
9
+ {
10
+ "data": {
11
+ "text/plain": [
12
+ "\"\\nimport os\\n\\n# 设置环境变量\\nos.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'\\n\\n# 打印环境变量以确认设置成功\\nprint(os.environ.get('HF_ENDPOINT'))\\n\""
13
+ ]
14
+ },
15
+ "execution_count": 1,
16
+ "metadata": {},
17
+ "output_type": "execute_result"
18
+ }
19
+ ],
20
+ "source": [
21
+ "import subprocess\n",
22
+ "import os\n",
23
+ "\n",
24
+ "result = subprocess.run('bash -c \"source /etc/network_turbo && env | grep proxy\"', shell=True, capture_output=True, text=True)\n",
25
+ "output = result.stdout\n",
26
+ "for line in output.splitlines():\n",
27
+ " if '=' in line:\n",
28
+ " var, value = line.split('=', 1)\n",
29
+ " os.environ[var] = value\n",
30
+ "\n",
31
+ "\"\"\"\n",
32
+ "import os\n",
33
+ "\n",
34
+ "# 设置环境变量\n",
35
+ "os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'\n",
36
+ "\n",
37
+ "# 打印环境变量以确认设置成功\n",
38
+ "print(os.environ.get('HF_ENDPOINT'))\n",
39
+ "\"\"\""
40
+ ]
41
+ },
42
+ {
43
+ "cell_type": "code",
44
+ "execution_count": 2,
45
+ "id": "4aa15036-4be6-44ac-be6d-83389ae07c5d",
46
+ "metadata": {},
47
+ "outputs": [],
48
+ "source": [
49
+ "# from datasets import load_dataset\n",
50
+ "\n",
51
+ "# # 加载特定语言的 Wikipedia 语料\n",
52
+ "# langs = [\"20220301.en\", \"20220301.fr\", \"20220301.de\", \"20220301.zh\", \"20220301.ja\"]\n",
53
+ "# datasets = {lang: load_dataset(\"wikimedia/wikipedia\", lang, split=\"train\") for lang in langs}"
54
+ ]
55
+ },
56
+ {
57
+ "cell_type": "code",
58
+ "execution_count": 3,
59
+ "id": "02351414-fcad-44ba-964c-8ced0a88e609",
60
+ "metadata": {},
61
+ "outputs": [
62
+ {
63
+ "data": {
64
+ "application/vnd.jupyter.widget-view+json": {
65
+ "model_id": "408855cc406d47a3a47803d00829febd",
66
+ "version_major": 2,
67
+ "version_minor": 0
68
+ },
69
+ "text/plain": [
70
+ "Loading dataset shards: 0%| | 0/41 [00:00<?, ?it/s]"
71
+ ]
72
+ },
73
+ "metadata": {},
74
+ "output_type": "display_data"
75
+ },
76
+ {
77
+ "data": {
78
+ "text/plain": [
79
+ "DatasetDict({\n",
80
+ " train: Dataset({\n",
81
+ " features: ['id', 'url', 'title', 'text'],\n",
82
+ " num_rows: 6458670\n",
83
+ " })\n",
84
+ "})"
85
+ ]
86
+ },
87
+ "execution_count": 3,
88
+ "metadata": {},
89
+ "output_type": "execute_result"
90
+ }
91
+ ],
92
+ "source": [
93
+ "from datasets import load_dataset\n",
94
+ "datasets_en = load_dataset(\"legacy-datasets/wikipedia\", \"20220301.en\")\n",
95
+ "datasets_en"
96
+ ]
97
+ },
98
+ {
99
+ "cell_type": "code",
100
+ "execution_count": 4,
101
+ "id": "f00a2e59-0aed-446e-9c7b-d21233bcdd1c",
102
+ "metadata": {},
103
+ "outputs": [],
104
+ "source": [
105
+ "wiki = open(\"en_wiki.txt\",\"w\")\n",
106
+ "for item in datasets_en[\"train\"]:\n",
107
+ " line = item[\"title\"] + \" \"+ item[\"text\"] + \"\\n\"\n",
108
+ " wiki.write(line)\n",
109
+ "wiki.close()"
110
+ ]
111
+ },
112
+ {
113
+ "cell_type": "code",
114
+ "execution_count": 5,
115
+ "id": "ce2039da-ae06-49de-87da-bc452ee1f252",
116
+ "metadata": {},
117
+ "outputs": [
118
+ {
119
+ "data": {
120
+ "application/vnd.jupyter.widget-view+json": {
121
+ "model_id": "d240fe1bef53424e9ccbb8af29346f0c",
122
+ "version_major": 2,
123
+ "version_minor": 0
124
+ },
125
+ "text/plain": [
126
+ "train-00000-of-00015.parquet: 0%| | 0.00/764M [00:00<?, ?B/s]"
127
+ ]
128
+ },
129
+ "metadata": {},
130
+ "output_type": "display_data"
131
+ },
132
+ {
133
+ "data": {
134
+ "application/vnd.jupyter.widget-view+json": {
135
+ "model_id": "807912e6be304fcaa0786f2c863fa8ab",
136
+ "version_major": 2,
137
+ "version_minor": 0
138
+ },
139
+ "text/plain": [
140
+ "train-00001-of-00015.parquet: 0%| | 0.00/416M [00:00<?, ?B/s]"
141
+ ]
142
+ },
143
+ "metadata": {},
144
+ "output_type": "display_data"
145
+ },
146
+ {
147
+ "data": {
148
+ "application/vnd.jupyter.widget-view+json": {
149
+ "model_id": "ff2349e16ea44a1a9033f04096cf0fe4",
150
+ "version_major": 2,
151
+ "version_minor": 0
152
+ },
153
+ "text/plain": [
154
+ "train-00002-of-00015.parquet: 0%| | 0.00/342M [00:00<?, ?B/s]"
155
+ ]
156
+ },
157
+ "metadata": {},
158
+ "output_type": "display_data"
159
+ },
160
+ {
161
+ "data": {
162
+ "application/vnd.jupyter.widget-view+json": {
163
+ "model_id": "428705e16387469abc2f70d8f01032c8",
164
+ "version_major": 2,
165
+ "version_minor": 0
166
+ },
167
+ "text/plain": [
168
+ "train-00003-of-00015.parquet: 0%| | 0.00/306M [00:00<?, ?B/s]"
169
+ ]
170
+ },
171
+ "metadata": {},
172
+ "output_type": "display_data"
173
+ },
174
+ {
175
+ "data": {
176
+ "application/vnd.jupyter.widget-view+json": {
177
+ "model_id": "3edadedc0767441fa441450e541d0a41",
178
+ "version_major": 2,
179
+ "version_minor": 0
180
+ },
181
+ "text/plain": [
182
+ "train-00004-of-00015.parquet: 0%| | 0.00/281M [00:00<?, ?B/s]"
183
+ ]
184
+ },
185
+ "metadata": {},
186
+ "output_type": "display_data"
187
+ },
188
+ {
189
+ "data": {
190
+ "application/vnd.jupyter.widget-view+json": {
191
+ "model_id": "485c9747448e4fc792c748e3920d893c",
192
+ "version_major": 2,
193
+ "version_minor": 0
194
+ },
195
+ "text/plain": [
196
+ "train-00005-of-00015.parquet: 0%| | 0.00/261M [00:00<?, ?B/s]"
197
+ ]
198
+ },
199
+ "metadata": {},
200
+ "output_type": "display_data"
201
+ },
202
+ {
203
+ "data": {
204
+ "application/vnd.jupyter.widget-view+json": {
205
+ "model_id": "32acc84b51494535aff3fe9edf17d0f9",
206
+ "version_major": 2,
207
+ "version_minor": 0
208
+ },
209
+ "text/plain": [
210
+ "train-00006-of-00015.parquet: 0%| | 0.00/220M [00:00<?, ?B/s]"
211
+ ]
212
+ },
213
+ "metadata": {},
214
+ "output_type": "display_data"
215
+ },
216
+ {
217
+ "data": {
218
+ "application/vnd.jupyter.widget-view+json": {
219
+ "model_id": "1626f0e6390542298f9c22e659e95d52",
220
+ "version_major": 2,
221
+ "version_minor": 0
222
+ },
223
+ "text/plain": [
224
+ "train-00007-of-00015.parquet: 0%| | 0.00/210M [00:00<?, ?B/s]"
225
+ ]
226
+ },
227
+ "metadata": {},
228
+ "output_type": "display_data"
229
+ },
230
+ {
231
+ "data": {
232
+ "application/vnd.jupyter.widget-view+json": {
233
+ "model_id": "5c9d6d941a564add86f644daa4710850",
234
+ "version_major": 2,
235
+ "version_minor": 0
236
+ },
237
+ "text/plain": [
238
+ "train-00008-of-00015.parquet: 0%| | 0.00/215M [00:00<?, ?B/s]"
239
+ ]
240
+ },
241
+ "metadata": {},
242
+ "output_type": "display_data"
243
+ },
244
+ {
245
+ "data": {
246
+ "application/vnd.jupyter.widget-view+json": {
247
+ "model_id": "13b0fe00bca644d9b971793375c2bc3b",
248
+ "version_major": 2,
249
+ "version_minor": 0
250
+ },
251
+ "text/plain": [
252
+ "train-00009-of-00015.parquet: 0%| | 0.00/211M [00:00<?, ?B/s]"
253
+ ]
254
+ },
255
+ "metadata": {},
256
+ "output_type": "display_data"
257
+ },
258
+ {
259
+ "data": {
260
+ "application/vnd.jupyter.widget-view+json": {
261
+ "model_id": "60ff775f33b74bf6b9be369f4b38e116",
262
+ "version_major": 2,
263
+ "version_minor": 0
264
+ },
265
+ "text/plain": [
266
+ "train-00010-of-00015.parquet: 0%| | 0.00/181M [00:00<?, ?B/s]"
267
+ ]
268
+ },
269
+ "metadata": {},
270
+ "output_type": "display_data"
271
+ },
272
+ {
273
+ "data": {
274
+ "application/vnd.jupyter.widget-view+json": {
275
+ "model_id": "862390a8eff241b3b371e6db7404591a",
276
+ "version_major": 2,
277
+ "version_minor": 0
278
+ },
279
+ "text/plain": [
280
+ "train-00011-of-00015.parquet: 0%| | 0.00/197M [00:00<?, ?B/s]"
281
+ ]
282
+ },
283
+ "metadata": {},
284
+ "output_type": "display_data"
285
+ },
286
+ {
287
+ "data": {
288
+ "application/vnd.jupyter.widget-view+json": {
289
+ "model_id": "6162884ffd804cffb1f0d1e6a57e940f",
290
+ "version_major": 2,
291
+ "version_minor": 0
292
+ },
293
+ "text/plain": [
294
+ "train-00012-of-00015.parquet: 0%| | 0.00/180M [00:00<?, ?B/s]"
295
+ ]
296
+ },
297
+ "metadata": {},
298
+ "output_type": "display_data"
299
+ },
300
+ {
301
+ "data": {
302
+ "application/vnd.jupyter.widget-view+json": {
303
+ "model_id": "0d8e1f181e4043019e43704b685851f2",
304
+ "version_major": 2,
305
+ "version_minor": 0
306
+ },
307
+ "text/plain": [
308
+ "train-00013-of-00015.parquet: 0%| | 0.00/219M [00:00<?, ?B/s]"
309
+ ]
310
+ },
311
+ "metadata": {},
312
+ "output_type": "display_data"
313
+ },
314
+ {
315
+ "data": {
316
+ "application/vnd.jupyter.widget-view+json": {
317
+ "model_id": "fc7550b48dbe4d35b1c5e1c7ff52f0a8",
318
+ "version_major": 2,
319
+ "version_minor": 0
320
+ },
321
+ "text/plain": [
322
+ "train-00014-of-00015.parquet: 0%| | 0.00/220M [00:00<?, ?B/s]"
323
+ ]
324
+ },
325
+ "metadata": {},
326
+ "output_type": "display_data"
327
+ },
328
+ {
329
+ "data": {
330
+ "application/vnd.jupyter.widget-view+json": {
331
+ "model_id": "1d84178aa296430cb2e949c37fbde0e2",
332
+ "version_major": 2,
333
+ "version_minor": 0
334
+ },
335
+ "text/plain": [
336
+ "Generating train split: 0%| | 0/2402095 [00:00<?, ? examples/s]"
337
+ ]
338
+ },
339
+ "metadata": {},
340
+ "output_type": "display_data"
341
+ },
342
+ {
343
+ "data": {
344
+ "text/plain": [
345
+ "DatasetDict({\n",
346
+ " train: Dataset({\n",
347
+ " features: ['id', 'url', 'title', 'text'],\n",
348
+ " num_rows: 2402095\n",
349
+ " })\n",
350
+ "})"
351
+ ]
352
+ },
353
+ "execution_count": 5,
354
+ "metadata": {},
355
+ "output_type": "execute_result"
356
+ }
357
+ ],
358
+ "source": [
359
+ "datasets_wiki = load_dataset(\"legacy-datasets/wikipedia\", \"20220301.fr\")\n",
360
+ "datasets_wiki"
361
+ ]
362
+ },
363
+ {
364
+ "cell_type": "code",
365
+ "execution_count": 7,
366
+ "id": "ac610da1-9529-491a-a9f4-5d6f3a92cce7",
367
+ "metadata": {},
368
+ "outputs": [],
369
+ "source": [
370
+ "wiki = open(\"fr_wiki.txt\",\"w\")\n",
371
+ "for item in datasets_wiki[\"train\"]:\n",
372
+ " line = item[\"title\"] + \" \"+ item[\"text\"] + \"\\n\"\n",
373
+ " wiki.write(line)\n",
374
+ "wiki.close()"
375
+ ]
376
+ },
377
+ {
378
+ "cell_type": "code",
379
+ "execution_count": null,
380
+ "id": "8be4a828-affd-4b07-8821-16cbefb93bd6",
381
+ "metadata": {},
382
+ "outputs": [],
383
+ "source": []
384
+ },
385
+ {
386
+ "cell_type": "code",
387
+ "execution_count": 8,
388
+ "id": "2a4a8425-8615-45c5-84cf-fcfb6a54727a",
389
+ "metadata": {},
390
+ "outputs": [
391
+ {
392
+ "data": {
393
+ "application/vnd.jupyter.widget-view+json": {
394
+ "model_id": "1ade008c4100486e9e47a52fadc1ece7",
395
+ "version_major": 2,
396
+ "version_minor": 0
397
+ },
398
+ "text/plain": [
399
+ "Downloading data: 0%| | 0/18 [00:00<?, ?files/s]"
400
+ ]
401
+ },
402
+ "metadata": {},
403
+ "output_type": "display_data"
404
+ },
405
+ {
406
+ "data": {
407
+ "application/vnd.jupyter.widget-view+json": {
408
+ "model_id": "bbe51753a90240d399d26dfd0bdb1aa6",
409
+ "version_major": 2,
410
+ "version_minor": 0
411
+ },
412
+ "text/plain": [
413
+ "train-00010-of-00018.parquet: 0%| | 0.00/239M [00:00<?, ?B/s]"
414
+ ]
415
+ },
416
+ "metadata": {},
417
+ "output_type": "display_data"
418
+ },
419
+ {
420
+ "data": {
421
+ "application/vnd.jupyter.widget-view+json": {
422
+ "model_id": "717f471eccca49e5a415f8b8b8c9c6e3",
423
+ "version_major": 2,
424
+ "version_minor": 0
425
+ },
426
+ "text/plain": [
427
+ "train-00011-of-00018.parquet: 0%| | 0.00/252M [00:00<?, ?B/s]"
428
+ ]
429
+ },
430
+ "metadata": {},
431
+ "output_type": "display_data"
432
+ },
433
+ {
434
+ "data": {
435
+ "application/vnd.jupyter.widget-view+json": {
436
+ "model_id": "1ea2609792144411afa2a855e2ff1c7d",
437
+ "version_major": 2,
438
+ "version_minor": 0
439
+ },
440
+ "text/plain": [
441
+ "train-00005-of-00018.parquet: 0%| | 0.00/283M [00:00<?, ?B/s]"
442
+ ]
443
+ },
444
+ "metadata": {},
445
+ "output_type": "display_data"
446
+ },
447
+ {
448
+ "data": {
449
+ "application/vnd.jupyter.widget-view+json": {
450
+ "model_id": "a8c88a13fd5d475085649381fe2fbba7",
451
+ "version_major": 2,
452
+ "version_minor": 0
453
+ },
454
+ "text/plain": [
455
+ "train-00008-of-00018.parquet: 0%| | 0.00/241M [00:00<?, ?B/s]"
456
+ ]
457
+ },
458
+ "metadata": {},
459
+ "output_type": "display_data"
460
+ },
461
+ {
462
+ "data": {
463
+ "application/vnd.jupyter.widget-view+json": {
464
+ "model_id": "0b63f5caf6c54225b35a3bdc915cc73e",
465
+ "version_major": 2,
466
+ "version_minor": 0
467
+ },
468
+ "text/plain": [
469
+ "train-00000-of-00018.parquet: 0%| | 0.00/773M [00:00<?, ?B/s]"
470
+ ]
471
+ },
472
+ "metadata": {},
473
+ "output_type": "display_data"
474
+ },
475
+ {
476
+ "data": {
477
+ "application/vnd.jupyter.widget-view+json": {
478
+ "model_id": "de0f6e9973ab427eaa937924b1134029",
479
+ "version_major": 2,
480
+ "version_minor": 0
481
+ },
482
+ "text/plain": [
483
+ "train-00001-of-00018.parquet: 0%| | 0.00/446M [00:00<?, ?B/s]"
484
+ ]
485
+ },
486
+ "metadata": {},
487
+ "output_type": "display_data"
488
+ },
489
+ {
490
+ "data": {
491
+ "application/vnd.jupyter.widget-view+json": {
492
+ "model_id": "beac8579da8244deadc2da23aec6ec6e",
493
+ "version_major": 2,
494
+ "version_minor": 0
495
+ },
496
+ "text/plain": [
497
+ "train-00015-of-00018.parquet: 0%| | 0.00/224M [00:00<?, ?B/s]"
498
+ ]
499
+ },
500
+ "metadata": {},
501
+ "output_type": "display_data"
502
+ },
503
+ {
504
+ "data": {
505
+ "application/vnd.jupyter.widget-view+json": {
506
+ "model_id": "293b420636b44bab986752b92c91a1b3",
507
+ "version_major": 2,
508
+ "version_minor": 0
509
+ },
510
+ "text/plain": [
511
+ "train-00009-of-00018.parquet: 0%| | 0.00/236M [00:00<?, ?B/s]"
512
+ ]
513
+ },
514
+ "metadata": {},
515
+ "output_type": "display_data"
516
+ },
517
+ {
518
+ "data": {
519
+ "application/vnd.jupyter.widget-view+json": {
520
+ "model_id": "d3d66b5d72ac472aa3ba20dae34f9695",
521
+ "version_major": 2,
522
+ "version_minor": 0
523
+ },
524
+ "text/plain": [
525
+ "train-00012-of-00018.parquet: 0%| | 0.00/245M [00:00<?, ?B/s]"
526
+ ]
527
+ },
528
+ "metadata": {},
529
+ "output_type": "display_data"
530
+ },
531
+ {
532
+ "data": {
533
+ "application/vnd.jupyter.widget-view+json": {
534
+ "model_id": "3be8b392a6614472b005ba9b9b1e1d59",
535
+ "version_major": 2,
536
+ "version_minor": 0
537
+ },
538
+ "text/plain": [
539
+ "train-00006-of-00018.parquet: 0%| | 0.00/277M [00:00<?, ?B/s]"
540
+ ]
541
+ },
542
+ "metadata": {},
543
+ "output_type": "display_data"
544
+ },
545
+ {
546
+ "data": {
547
+ "application/vnd.jupyter.widget-view+json": {
548
+ "model_id": "b371f42b410e4cbf9137666b4bac2647",
549
+ "version_major": 2,
550
+ "version_minor": 0
551
+ },
552
+ "text/plain": [
553
+ "train-00007-of-00018.parquet: 0%| | 0.00/256M [00:00<?, ?B/s]"
554
+ ]
555
+ },
556
+ "metadata": {},
557
+ "output_type": "display_data"
558
+ },
559
+ {
560
+ "data": {
561
+ "application/vnd.jupyter.widget-view+json": {
562
+ "model_id": "a9ab370c0f534b289fdac4ed3cddff3a",
563
+ "version_major": 2,
564
+ "version_minor": 0
565
+ },
566
+ "text/plain": [
567
+ "train-00002-of-00018.parquet: 0%| | 0.00/369M [00:00<?, ?B/s]"
568
+ ]
569
+ },
570
+ "metadata": {},
571
+ "output_type": "display_data"
572
+ },
573
+ {
574
+ "data": {
575
+ "application/vnd.jupyter.widget-view+json": {
576
+ "model_id": "dce437fe46e04c82b2f8ab7a3121592b",
577
+ "version_major": 2,
578
+ "version_minor": 0
579
+ },
580
+ "text/plain": [
581
+ "train-00003-of-00018.parquet: 0%| | 0.00/291M [00:00<?, ?B/s]"
582
+ ]
583
+ },
584
+ "metadata": {},
585
+ "output_type": "display_data"
586
+ },
587
+ {
588
+ "data": {
589
+ "application/vnd.jupyter.widget-view+json": {
590
+ "model_id": "5bec466db34b4daa926a6eef48273817",
591
+ "version_major": 2,
592
+ "version_minor": 0
593
+ },
594
+ "text/plain": [
595
+ "train-00014-of-00018.parquet: 0%| | 0.00/233M [00:00<?, ?B/s]"
596
+ ]
597
+ },
598
+ "metadata": {},
599
+ "output_type": "display_data"
600
+ },
601
+ {
602
+ "data": {
603
+ "application/vnd.jupyter.widget-view+json": {
604
+ "model_id": "0e47f796b69749ef8f3bf255300553ec",
605
+ "version_major": 2,
606
+ "version_minor": 0
607
+ },
608
+ "text/plain": [
609
+ "train-00004-of-00018.parquet: 0%| | 0.00/300M [00:00<?, ?B/s]"
610
+ ]
611
+ },
612
+ "metadata": {},
613
+ "output_type": "display_data"
614
+ },
615
+ {
616
+ "data": {
617
+ "application/vnd.jupyter.widget-view+json": {
618
+ "model_id": "b6dd9bf441d24ffa886996c210b4607f",
619
+ "version_major": 2,
620
+ "version_minor": 0
621
+ },
622
+ "text/plain": [
623
+ "train-00013-of-00018.parquet: 0%| | 0.00/233M [00:00<?, ?B/s]"
624
+ ]
625
+ },
626
+ "metadata": {},
627
+ "output_type": "display_data"
628
+ },
629
+ {
630
+ "data": {
631
+ "application/vnd.jupyter.widget-view+json": {
632
+ "model_id": "0fa5351d1be542b88a3024b900b0ef19",
633
+ "version_major": 2,
634
+ "version_minor": 0
635
+ },
636
+ "text/plain": [
637
+ "train-00016-of-00018.parquet: 0%| | 0.00/224M [00:00<?, ?B/s]"
638
+ ]
639
+ },
640
+ "metadata": {},
641
+ "output_type": "display_data"
642
+ },
643
+ {
644
+ "data": {
645
+ "application/vnd.jupyter.widget-view+json": {
646
+ "model_id": "1452efc3d5564ef99ae336dbcc36b22f",
647
+ "version_major": 2,
648
+ "version_minor": 0
649
+ },
650
+ "text/plain": [
651
+ "train-00017-of-00018.parquet: 0%| | 0.00/222M [00:00<?, ?B/s]"
652
+ ]
653
+ },
654
+ "metadata": {},
655
+ "output_type": "display_data"
656
+ },
657
+ {
658
+ "data": {
659
+ "application/vnd.jupyter.widget-view+json": {
660
+ "model_id": "c12fa5a2d21241c6a31b4c889999ad0a",
661
+ "version_major": 2,
662
+ "version_minor": 0
663
+ },
664
+ "text/plain": [
665
+ "Generating train split: 0%| | 0/2665357 [00:00<?, ? examples/s]"
666
+ ]
667
+ },
668
+ "metadata": {},
669
+ "output_type": "display_data"
670
+ },
671
+ {
672
+ "data": {
673
+ "application/vnd.jupyter.widget-view+json": {
674
+ "model_id": "ca6b84a24ecf4e00bf8e550ba5eb5054",
675
+ "version_major": 2,
676
+ "version_minor": 0
677
+ },
678
+ "text/plain": [
679
+ "Loading dataset shards: 0%| | 0/18 [00:00<?, ?it/s]"
680
+ ]
681
+ },
682
+ "metadata": {},
683
+ "output_type": "display_data"
684
+ },
685
+ {
686
+ "data": {
687
+ "text/plain": [
688
+ "DatasetDict({\n",
689
+ " train: Dataset({\n",
690
+ " features: ['id', 'url', 'title', 'text'],\n",
691
+ " num_rows: 2665357\n",
692
+ " })\n",
693
+ "})"
694
+ ]
695
+ },
696
+ "execution_count": 8,
697
+ "metadata": {},
698
+ "output_type": "execute_result"
699
+ }
700
+ ],
701
+ "source": [
702
+ "datasets_wiki = load_dataset(\"legacy-datasets/wikipedia\", \"20220301.de\")\n",
703
+ "datasets_wiki"
704
+ ]
705
+ },
706
+ {
707
+ "cell_type": "code",
708
+ "execution_count": 9,
709
+ "id": "1c60e53d-3031-44e7-b1da-edc419079f13",
710
+ "metadata": {},
711
+ "outputs": [],
712
+ "source": [
713
+ "wiki = open(\"de_wiki.txt\",\"w\")\n",
714
+ "for item in datasets_wiki[\"train\"]:\n",
715
+ " line = item[\"title\"] + \" \"+ item[\"text\"] + \"\\n\"\n",
716
+ " wiki.write(line)\n",
717
+ "wiki.close()"
718
+ ]
719
+ },
720
+ {
721
+ "cell_type": "code",
722
+ "execution_count": null,
723
+ "id": "c42100db-79d9-4cbe-9582-e43d58567173",
724
+ "metadata": {},
725
+ "outputs": [],
726
+ "source": []
727
+ },
728
+ {
729
+ "cell_type": "code",
730
+ "execution_count": 12,
731
+ "id": "eee3698d-ccb8-4cec-9796-e93886fd6b1e",
732
+ "metadata": {},
733
+ "outputs": [
734
+ {
735
+ "data": {
736
+ "application/vnd.jupyter.widget-view+json": {
737
+ "model_id": "a03b4c97410645eaaae45f25e1c03108",
738
+ "version_major": 2,
739
+ "version_minor": 0
740
+ },
741
+ "text/plain": [
742
+ "train-00000-of-00015.parquet: 0%| | 0.00/612M [00:00<?, ?B/s]"
743
+ ]
744
+ },
745
+ "metadata": {},
746
+ "output_type": "display_data"
747
+ },
748
+ {
749
+ "data": {
750
+ "application/vnd.jupyter.widget-view+json": {
751
+ "model_id": "5a03f25cd8354e3299b82554c0fc3287",
752
+ "version_major": 2,
753
+ "version_minor": 0
754
+ },
755
+ "text/plain": [
756
+ "train-00001-of-00015.parquet: 0%| | 0.00/383M [00:00<?, ?B/s]"
757
+ ]
758
+ },
759
+ "metadata": {},
760
+ "output_type": "display_data"
761
+ },
762
+ {
763
+ "data": {
764
+ "application/vnd.jupyter.widget-view+json": {
765
+ "model_id": "8a63110f3cf648eda442ee3177aee2f6",
766
+ "version_major": 2,
767
+ "version_minor": 0
768
+ },
769
+ "text/plain": [
770
+ "train-00002-of-00015.parquet: 0%| | 0.00/305M [00:00<?, ?B/s]"
771
+ ]
772
+ },
773
+ "metadata": {},
774
+ "output_type": "display_data"
775
+ },
776
+ {
777
+ "data": {
778
+ "application/vnd.jupyter.widget-view+json": {
779
+ "model_id": "09ec9ce700084abcb944f6b6669ebf15",
780
+ "version_major": 2,
781
+ "version_minor": 0
782
+ },
783
+ "text/plain": [
784
+ "train-00003-of-00015.parquet: 0%| | 0.00/252M [00:00<?, ?B/s]"
785
+ ]
786
+ },
787
+ "metadata": {},
788
+ "output_type": "display_data"
789
+ },
790
+ {
791
+ "data": {
792
+ "application/vnd.jupyter.widget-view+json": {
793
+ "model_id": "363a3b6272c24e23a0e708864c872310",
794
+ "version_major": 2,
795
+ "version_minor": 0
796
+ },
797
+ "text/plain": [
798
+ "train-00004-of-00015.parquet: 0%| | 0.00/240M [00:00<?, ?B/s]"
799
+ ]
800
+ },
801
+ "metadata": {},
802
+ "output_type": "display_data"
803
+ },
804
+ {
805
+ "data": {
806
+ "application/vnd.jupyter.widget-view+json": {
807
+ "model_id": "58b7f596d96246439e7cc21acc333fb4",
808
+ "version_major": 2,
809
+ "version_minor": 0
810
+ },
811
+ "text/plain": [
812
+ "train-00005-of-00015.parquet: 0%| | 0.00/240M [00:00<?, ?B/s]"
813
+ ]
814
+ },
815
+ "metadata": {},
816
+ "output_type": "display_data"
817
+ },
818
+ {
819
+ "data": {
820
+ "application/vnd.jupyter.widget-view+json": {
821
+ "model_id": "2872cc14f4944b4894bd05bbb256d892",
822
+ "version_major": 2,
823
+ "version_minor": 0
824
+ },
825
+ "text/plain": [
826
+ "train-00006-of-00015.parquet: 0%| | 0.00/233M [00:00<?, ?B/s]"
827
+ ]
828
+ },
829
+ "metadata": {},
830
+ "output_type": "display_data"
831
+ },
832
+ {
833
+ "data": {
834
+ "application/vnd.jupyter.widget-view+json": {
835
+ "model_id": "3d65c3b76a02445e85d6723ea19565c2",
836
+ "version_major": 2,
837
+ "version_minor": 0
838
+ },
839
+ "text/plain": [
840
+ "train-00007-of-00015.parquet: 0%| | 0.00/218M [00:00<?, ?B/s]"
841
+ ]
842
+ },
843
+ "metadata": {},
844
+ "output_type": "display_data"
845
+ },
846
+ {
847
+ "data": {
848
+ "application/vnd.jupyter.widget-view+json": {
849
+ "model_id": "a77dd5822bd34896b4dbd4d22cb8331c",
850
+ "version_major": 2,
851
+ "version_minor": 0
852
+ },
853
+ "text/plain": [
854
+ "train-00008-of-00015.parquet: 0%| | 0.00/224M [00:00<?, ?B/s]"
855
+ ]
856
+ },
857
+ "metadata": {},
858
+ "output_type": "display_data"
859
+ },
860
+ {
861
+ "data": {
862
+ "application/vnd.jupyter.widget-view+json": {
863
+ "model_id": "52ce12211c8743acac275af2c7ac049c",
864
+ "version_major": 2,
865
+ "version_minor": 0
866
+ },
867
+ "text/plain": [
868
+ "train-00009-of-00015.parquet: 0%| | 0.00/235M [00:00<?, ?B/s]"
869
+ ]
870
+ },
871
+ "metadata": {},
872
+ "output_type": "display_data"
873
+ },
874
+ {
875
+ "data": {
876
+ "application/vnd.jupyter.widget-view+json": {
877
+ "model_id": "9da3788934f14b15a63e4add38e989b4",
878
+ "version_major": 2,
879
+ "version_minor": 0
880
+ },
881
+ "text/plain": [
882
+ "train-00010-of-00015.parquet: 0%| | 0.00/217M [00:00<?, ?B/s]"
883
+ ]
884
+ },
885
+ "metadata": {},
886
+ "output_type": "display_data"
887
+ },
888
+ {
889
+ "data": {
890
+ "application/vnd.jupyter.widget-view+json": {
891
+ "model_id": "06254bec73a24830b685468235260f0c",
892
+ "version_major": 2,
893
+ "version_minor": 0
894
+ },
895
+ "text/plain": [
896
+ "train-00011-of-00015.parquet: 0%| | 0.00/213M [00:00<?, ?B/s]"
897
+ ]
898
+ },
899
+ "metadata": {},
900
+ "output_type": "display_data"
901
+ },
902
+ {
903
+ "data": {
904
+ "application/vnd.jupyter.widget-view+json": {
905
+ "model_id": "d25a6d36b72446458525dd060d63102a",
906
+ "version_major": 2,
907
+ "version_minor": 0
908
+ },
909
+ "text/plain": [
910
+ "train-00012-of-00015.parquet: 0%| | 0.00/201M [00:00<?, ?B/s]"
911
+ ]
912
+ },
913
+ "metadata": {},
914
+ "output_type": "display_data"
915
+ },
916
+ {
917
+ "data": {
918
+ "application/vnd.jupyter.widget-view+json": {
919
+ "model_id": "4da023af77894a72a1e6223c91820983",
920
+ "version_major": 2,
921
+ "version_minor": 0
922
+ },
923
+ "text/plain": [
924
+ "train-00013-of-00015.parquet: 0%| | 0.00/193M [00:00<?, ?B/s]"
925
+ ]
926
+ },
927
+ "metadata": {},
928
+ "output_type": "display_data"
929
+ },
930
+ {
931
+ "data": {
932
+ "application/vnd.jupyter.widget-view+json": {
933
+ "model_id": "6c6d63c3e4df46c69da6b42c8b7eae4a",
934
+ "version_major": 2,
935
+ "version_minor": 0
936
+ },
937
+ "text/plain": [
938
+ "train-00014-of-00015.parquet: 0%| | 0.00/176M [00:00<?, ?B/s]"
939
+ ]
940
+ },
941
+ "metadata": {},
942
+ "output_type": "display_data"
943
+ },
944
+ {
945
+ "data": {
946
+ "application/vnd.jupyter.widget-view+json": {
947
+ "model_id": "f98c43bad8a140cca38d7c00232f884a",
948
+ "version_major": 2,
949
+ "version_minor": 0
950
+ },
951
+ "text/plain": [
952
+ "Generating train split: 0%| | 0/1389467 [00:00<?, ? examples/s]"
953
+ ]
954
+ },
955
+ "metadata": {},
956
+ "output_type": "display_data"
957
+ },
958
+ {
959
+ "data": {
960
+ "text/plain": [
961
+ "DatasetDict({\n",
962
+ " train: Dataset({\n",
963
+ " features: ['id', 'url', 'title', 'text'],\n",
964
+ " num_rows: 1389467\n",
965
+ " })\n",
966
+ "})"
967
+ ]
968
+ },
969
+ "execution_count": 12,
970
+ "metadata": {},
971
+ "output_type": "execute_result"
972
+ }
973
+ ],
974
+ "source": [
975
+ "datasets_wiki = load_dataset(\"wikimedia/wikipedia\", \"20231101.ja\")\n",
976
+ "datasets_wiki"
977
+ ]
978
+ },
979
+ {
980
+ "cell_type": "code",
981
+ "execution_count": 14,
982
+ "id": "4b61a5b0-85ca-4a0e-99b0-0ea2e3bc9e82",
983
+ "metadata": {},
984
+ "outputs": [],
985
+ "source": [
986
+ "wiki = open(\"ja_wiki.txt\",\"w\")\n",
987
+ "for item in datasets_wiki[\"train\"]:\n",
988
+ " line = item[\"title\"] + \" \"+ item[\"text\"] + \"\\n\"\n",
989
+ " wiki.write(line)\n",
990
+ "wiki.close()"
991
+ ]
992
+ },
993
+ {
994
+ "cell_type": "code",
995
+ "execution_count": null,
996
+ "id": "5c56f7fd-8188-4d30-928d-e307764f23f9",
997
+ "metadata": {},
998
+ "outputs": [],
999
+ "source": []
1000
+ },
1001
+ {
1002
+ "cell_type": "code",
1003
+ "execution_count": 15,
1004
+ "id": "1668c12c-cbfe-4ea8-82c6-4e27607e9b67",
1005
+ "metadata": {},
1006
+ "outputs": [
1007
+ {
1008
+ "data": {
1009
+ "application/vnd.jupyter.widget-view+json": {
1010
+ "model_id": "8f8cd7bb3992440c89087f809b96832d",
1011
+ "version_major": 2,
1012
+ "version_minor": 0
1013
+ },
1014
+ "text/plain": [
1015
+ "train-00000-of-00013.parquet: 0%| | 0.00/688M [00:00<?, ?B/s]"
1016
+ ]
1017
+ },
1018
+ "metadata": {},
1019
+ "output_type": "display_data"
1020
+ },
1021
+ {
1022
+ "data": {
1023
+ "application/vnd.jupyter.widget-view+json": {
1024
+ "model_id": "b6c6f856ff3f4070a26e56bfacca6deb",
1025
+ "version_major": 2,
1026
+ "version_minor": 0
1027
+ },
1028
+ "text/plain": [
1029
+ "train-00001-of-00013.parquet: 0%| | 0.00/376M [00:00<?, ?B/s]"
1030
+ ]
1031
+ },
1032
+ "metadata": {},
1033
+ "output_type": "display_data"
1034
+ },
1035
+ {
1036
+ "data": {
1037
+ "application/vnd.jupyter.widget-view+json": {
1038
+ "model_id": "d90dc9bb27ef418e906109adf3e7fad3",
1039
+ "version_major": 2,
1040
+ "version_minor": 0
1041
+ },
1042
+ "text/plain": [
1043
+ "train-00002-of-00013.parquet: 0%| | 0.00/287M [00:00<?, ?B/s]"
1044
+ ]
1045
+ },
1046
+ "metadata": {},
1047
+ "output_type": "display_data"
1048
+ },
1049
+ {
1050
+ "data": {
1051
+ "application/vnd.jupyter.widget-view+json": {
1052
+ "model_id": "e72006a2edd24c3a9bd88a0e2ac57d7b",
1053
+ "version_major": 2,
1054
+ "version_minor": 0
1055
+ },
1056
+ "text/plain": [
1057
+ "train-00003-of-00013.parquet: 0%| | 0.00/245M [00:00<?, ?B/s]"
1058
+ ]
1059
+ },
1060
+ "metadata": {},
1061
+ "output_type": "display_data"
1062
+ },
1063
+ {
1064
+ "data": {
1065
+ "application/vnd.jupyter.widget-view+json": {
1066
+ "model_id": "df667cedb5374cd39e86e52c6cea3a7a",
1067
+ "version_major": 2,
1068
+ "version_minor": 0
1069
+ },
1070
+ "text/plain": [
1071
+ "train-00004-of-00013.parquet: 0%| | 0.00/168M [00:00<?, ?B/s]"
1072
+ ]
1073
+ },
1074
+ "metadata": {},
1075
+ "output_type": "display_data"
1076
+ },
1077
+ {
1078
+ "data": {
1079
+ "application/vnd.jupyter.widget-view+json": {
1080
+ "model_id": "6b005307539743769e0ac974c2996d9b",
1081
+ "version_major": 2,
1082
+ "version_minor": 0
1083
+ },
1084
+ "text/plain": [
1085
+ "train-00005-of-00013.parquet: 0%| | 0.00/178M [00:00<?, ?B/s]"
1086
+ ]
1087
+ },
1088
+ "metadata": {},
1089
+ "output_type": "display_data"
1090
+ },
1091
+ {
1092
+ "data": {
1093
+ "application/vnd.jupyter.widget-view+json": {
1094
+ "model_id": "c749d7a330f24d2fa684857be0700cc3",
1095
+ "version_major": 2,
1096
+ "version_minor": 0
1097
+ },
1098
+ "text/plain": [
1099
+ "train-00006-of-00013.parquet: 0%| | 0.00/216M [00:00<?, ?B/s]"
1100
+ ]
1101
+ },
1102
+ "metadata": {},
1103
+ "output_type": "display_data"
1104
+ },
1105
+ {
1106
+ "data": {
1107
+ "application/vnd.jupyter.widget-view+json": {
1108
+ "model_id": "7b07061ea4694ba8af395eeae31bd1fd",
1109
+ "version_major": 2,
1110
+ "version_minor": 0
1111
+ },
1112
+ "text/plain": [
1113
+ "train-00007-of-00013.parquet: 0%| | 0.00/241M [00:00<?, ?B/s]"
1114
+ ]
1115
+ },
1116
+ "metadata": {},
1117
+ "output_type": "display_data"
1118
+ },
1119
+ {
1120
+ "data": {
1121
+ "application/vnd.jupyter.widget-view+json": {
1122
+ "model_id": "0e07a225a3f24d9a9c92ad9ee153b066",
1123
+ "version_major": 2,
1124
+ "version_minor": 0
1125
+ },
1126
+ "text/plain": [
1127
+ "train-00008-of-00013.parquet: 0%| | 0.00/227M [00:00<?, ?B/s]"
1128
+ ]
1129
+ },
1130
+ "metadata": {},
1131
+ "output_type": "display_data"
1132
+ },
1133
+ {
1134
+ "data": {
1135
+ "application/vnd.jupyter.widget-view+json": {
1136
+ "model_id": "3054fd1470a34bbab1c6dee28dcb2441",
1137
+ "version_major": 2,
1138
+ "version_minor": 0
1139
+ },
1140
+ "text/plain": [
1141
+ "train-00009-of-00013.parquet: 0%| | 0.00/223M [00:00<?, ?B/s]"
1142
+ ]
1143
+ },
1144
+ "metadata": {},
1145
+ "output_type": "display_data"
1146
+ },
1147
+ {
1148
+ "data": {
1149
+ "application/vnd.jupyter.widget-view+json": {
1150
+ "model_id": "90f7a09fb95b4c4a926711a258c32107",
1151
+ "version_major": 2,
1152
+ "version_minor": 0
1153
+ },
1154
+ "text/plain": [
1155
+ "train-00010-of-00013.parquet: 0%| | 0.00/167M [00:00<?, ?B/s]"
1156
+ ]
1157
+ },
1158
+ "metadata": {},
1159
+ "output_type": "display_data"
1160
+ },
1161
+ {
1162
+ "data": {
1163
+ "application/vnd.jupyter.widget-view+json": {
1164
+ "model_id": "f2ceb836cfca47e2ba75659dacfc9739",
1165
+ "version_major": 2,
1166
+ "version_minor": 0
1167
+ },
1168
+ "text/plain": [
1169
+ "train-00011-of-00013.parquet: 0%| | 0.00/254M [00:00<?, ?B/s]"
1170
+ ]
1171
+ },
1172
+ "metadata": {},
1173
+ "output_type": "display_data"
1174
+ },
1175
+ {
1176
+ "data": {
1177
+ "application/vnd.jupyter.widget-view+json": {
1178
+ "model_id": "b20a7cea4fb146a5810cde198ef60ab6",
1179
+ "version_major": 2,
1180
+ "version_minor": 0
1181
+ },
1182
+ "text/plain": [
1183
+ "train-00012-of-00013.parquet: 0%| | 0.00/226M [00:00<?, ?B/s]"
1184
+ ]
1185
+ },
1186
+ "metadata": {},
1187
+ "output_type": "display_data"
1188
+ },
1189
+ {
1190
+ "data": {
1191
+ "application/vnd.jupyter.widget-view+json": {
1192
+ "model_id": "ba34e3fba4be480cbf145281eb834d64",
1193
+ "version_major": 2,
1194
+ "version_minor": 0
1195
+ },
1196
+ "text/plain": [
1197
+ "Generating train split: 0%| | 0/1841155 [00:00<?, ? examples/s]"
1198
+ ]
1199
+ },
1200
+ "metadata": {},
1201
+ "output_type": "display_data"
1202
+ },
1203
+ {
1204
+ "data": {
1205
+ "text/plain": [
1206
+ "DatasetDict({\n",
1207
+ " train: Dataset({\n",
1208
+ " features: ['id', 'url', 'title', 'text'],\n",
1209
+ " num_rows: 1841155\n",
1210
+ " })\n",
1211
+ "})"
1212
+ ]
1213
+ },
1214
+ "execution_count": 15,
1215
+ "metadata": {},
1216
+ "output_type": "execute_result"
1217
+ }
1218
+ ],
1219
+ "source": [
1220
+ "datasets_wiki = load_dataset(\"wikimedia/wikipedia\", \"20231101.es\")\n",
1221
+ "datasets_wiki"
1222
+ ]
1223
+ },
1224
+ {
1225
+ "cell_type": "code",
1226
+ "execution_count": 16,
1227
+ "id": "9b3d2f1d-47c3-4a4d-8f60-cf5874ec643b",
1228
+ "metadata": {},
1229
+ "outputs": [],
1230
+ "source": [
1231
+ "wiki = open(\"es_wiki.txt\",\"w\")\n",
1232
+ "for item in datasets_wiki[\"train\"]:\n",
1233
+ " line = item[\"title\"] + \" \"+ item[\"text\"] + \"\\n\"\n",
1234
+ " wiki.write(line)\n",
1235
+ "wiki.close()"
1236
+ ]
1237
+ },
1238
+ {
1239
+ "cell_type": "code",
1240
+ "execution_count": null,
1241
+ "id": "4d300900-2d36-4f21-a799-17b466239666",
1242
+ "metadata": {},
1243
+ "outputs": [],
1244
+ "source": []
1245
+ },
1246
+ {
1247
+ "cell_type": "code",
1248
+ "execution_count": 17,
1249
+ "id": "1d20d238-3015-443f-956a-ca1a7c622001",
1250
+ "metadata": {},
1251
+ "outputs": [
1252
+ {
1253
+ "data": {
1254
+ "text/plain": [
1255
+ "DatasetDict({\n",
1256
+ " train: Dataset({\n",
1257
+ " features: ['id', 'url', 'title', 'text'],\n",
1258
+ " num_rows: 1384748\n",
1259
+ " })\n",
1260
+ "})"
1261
+ ]
1262
+ },
1263
+ "execution_count": 17,
1264
+ "metadata": {},
1265
+ "output_type": "execute_result"
1266
+ }
1267
+ ],
1268
+ "source": [
1269
+ "datasets_wiki = load_dataset(\"wikimedia/wikipedia\", \"20231101.zh\")\n",
1270
+ "datasets_wiki"
1271
+ ]
1272
+ },
1273
+ {
1274
+ "cell_type": "code",
1275
+ "execution_count": 18,
1276
+ "id": "eac16b65-44d1-43ae-9aeb-c06a470daf0e",
1277
+ "metadata": {},
1278
+ "outputs": [],
1279
+ "source": [
1280
+ "wiki = open(\"zh_wiki.txt\",\"w\")\n",
1281
+ "for item in datasets_wiki[\"train\"]:\n",
1282
+ " line = item[\"title\"] + \" \"+ item[\"text\"] + \"\\n\"\n",
1283
+ " wiki.write(line)\n",
1284
+ "wiki.close()"
1285
+ ]
1286
+ },
1287
+ {
1288
+ "cell_type": "code",
1289
+ "execution_count": null,
1290
+ "id": "67c13fd3-ce11-41ce-8ca9-7d1d783310fb",
1291
+ "metadata": {},
1292
+ "outputs": [],
1293
+ "source": []
1294
+ },
1295
+ {
1296
+ "cell_type": "code",
1297
+ "execution_count": 19,
1298
+ "id": "8d4f223d-accc-44f5-acf3-5eca7621e960",
1299
+ "metadata": {},
1300
+ "outputs": [
1301
+ {
1302
+ "data": {
1303
+ "application/vnd.jupyter.widget-view+json": {
1304
+ "model_id": "2b39607ab45b4449b96d05858de65c90",
1305
+ "version_major": 2,
1306
+ "version_minor": 0
1307
+ },
1308
+ "text/plain": [
1309
+ "train-00000-of-00003.parquet: 0%| | 0.00/400M [00:00<?, ?B/s]"
1310
+ ]
1311
+ },
1312
+ "metadata": {},
1313
+ "output_type": "display_data"
1314
+ },
1315
+ {
1316
+ "data": {
1317
+ "application/vnd.jupyter.widget-view+json": {
1318
+ "model_id": "c930ca1ff58a4ba2995158c1944335c2",
1319
+ "version_major": 2,
1320
+ "version_minor": 0
1321
+ },
1322
+ "text/plain": [
1323
+ "train-00001-of-00003.parquet: 0%| | 0.00/205M [00:00<?, ?B/s]"
1324
+ ]
1325
+ },
1326
+ "metadata": {},
1327
+ "output_type": "display_data"
1328
+ },
1329
+ {
1330
+ "data": {
1331
+ "application/vnd.jupyter.widget-view+json": {
1332
+ "model_id": "37654989816342af9a355a2cb6e770b4",
1333
+ "version_major": 2,
1334
+ "version_minor": 0
1335
+ },
1336
+ "text/plain": [
1337
+ "train-00002-of-00003.parquet: 0%| | 0.00/177M [00:00<?, ?B/s]"
1338
+ ]
1339
+ },
1340
+ "metadata": {},
1341
+ "output_type": "display_data"
1342
+ },
1343
+ {
1344
+ "data": {
1345
+ "application/vnd.jupyter.widget-view+json": {
1346
+ "model_id": "b0dbd13bc9dc457eae92438fd63947ae",
1347
+ "version_major": 2,
1348
+ "version_minor": 0
1349
+ },
1350
+ "text/plain": [
1351
+ "Generating train split: 0%| | 0/647897 [00:00<?, ? examples/s]"
1352
+ ]
1353
+ },
1354
+ "metadata": {},
1355
+ "output_type": "display_data"
1356
+ },
1357
+ {
1358
+ "data": {
1359
+ "text/plain": [
1360
+ "DatasetDict({\n",
1361
+ " train: Dataset({\n",
1362
+ " features: ['id', 'url', 'title', 'text'],\n",
1363
+ " num_rows: 647897\n",
1364
+ " })\n",
1365
+ "})"
1366
+ ]
1367
+ },
1368
+ "execution_count": 19,
1369
+ "metadata": {},
1370
+ "output_type": "execute_result"
1371
+ }
1372
+ ],
1373
+ "source": [
1374
+ "datasets_wiki = load_dataset(\"wikimedia/wikipedia\", \"20231101.ko\")\n",
1375
+ "datasets_wiki"
1376
+ ]
1377
+ },
1378
+ {
1379
+ "cell_type": "code",
1380
+ "execution_count": 21,
1381
+ "id": "1af1e848-559a-4882-a3ce-966a63edac44",
1382
+ "metadata": {},
1383
+ "outputs": [],
1384
+ "source": [
1385
+ "wiki = open(\"ko_wiki.txt\",\"w\")\n",
1386
+ "for item in datasets_wiki[\"train\"]:\n",
1387
+ " line = item[\"title\"] + \" \"+ item[\"text\"] + \"\\n\"\n",
1388
+ " wiki.write(line)\n",
1389
+ "wiki.close()"
1390
+ ]
1391
+ },
1392
+ {
1393
+ "cell_type": "code",
1394
+ "execution_count": null,
1395
+ "id": "e4b3810a-f564-4a4e-8ccc-5f23f8541414",
1396
+ "metadata": {},
1397
+ "outputs": [],
1398
+ "source": []
1399
+ }
1400
+ ],
1401
+ "metadata": {
1402
+ "kernelspec": {
1403
+ "display_name": "Python 3 (ipykernel)",
1404
+ "language": "python",
1405
+ "name": "python3"
1406
+ },
1407
+ "language_info": {
1408
+ "codemirror_mode": {
1409
+ "name": "ipython",
1410
+ "version": 3
1411
+ },
1412
+ "file_extension": ".py",
1413
+ "mimetype": "text/x-python",
1414
+ "name": "python",
1415
+ "nbconvert_exporter": "python",
1416
+ "pygments_lexer": "ipython3",
1417
+ "version": "3.12.3"
1418
+ }
1419
+ },
1420
+ "nbformat": 4,
1421
+ "nbformat_minor": 5
1422
+ }
train_data/.ipynb_checkpoints/test-checkpoint.txt ADDED
The diff for this file is too large to render. See raw diff
 
train_data/de_wiki_4g.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:50fa5e0f7dc3d1e7e29b129b380b2ebe2226176e752fa168ce1327e5f4b11200
3
+ size 4354626975
train_data/dna_4g.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3acc05887b61da97e202a564ba99c6b394f260f7bfa4306ba18c2637f40f7757
3
+ size 4052518950
train_data/en_wiki_4g.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2e910449e76e03672deed6983640b08629c7ccd9f4579812b5eb5663c13ac9bf
3
+ size 3953811499
train_data/es_wiki_4g.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b31bbcdea9751517707b52144a133d61a1df3cdba799b0c51dd9b307a6791b09
3
+ size 3928023424
train_data/fr_wiki_4g.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b4e6e85cc891270b855ee6747df6320760ef2c91118d2d7f6df6c80ebc3ef9d1
3
+ size 4794614101
train_data/gene_eng.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0122099bd827924fd85f178305635b81ecaa54f157f2d340e09727a65320faa2
3
+ size 12245273833
train_data/gene_eng_zh.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b73592b4ce380b050cf6e51a9f8b5eed4a82d48f47e17f4bef811b8572a42fdf
3
+ size 14897010606
train_data/gene_eng_zh_de_es.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8aed6299fd5c812d09f9c5fe6a9f7f7dec9f6bcc7a4df13703859e19949eefd6
3
+ size 23179661005
train_data/ja_wiki_4g.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c3e89b190a14ec963fbccf1ef0642c4f8d2b036a6c843974b961176ec0fc3ecd
3
+ size 4585860236
train_data/ko_wiki_4g.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2f42d67971f7cfbc05b309816ec8063310dac075ec8ca1c166fa0bfd4261f9d7
3
+ size 1344091574
train_data/protein_4g.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a0921aa4a0ed580363e67953c09e82785941596f53d9c6b21a6ae6bf298c9f52
3
+ size 4238943384
train_data/test.txt ADDED
The diff for this file is too large to render. See raw diff
 
train_data/zh_wiki_4g.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bf18bd3e5917e94e3edac7a3b8bffd73ecbcc77806db69044a29f45591e34958
3
+ size 2651736773