S-Dreamer committed on
Commit
b65b044
·
verified ·
1 Parent(s): 90946e4

Create datasets_loader.py

Browse files
Files changed (1) hide show
  1. datasets_loader.py +33 -0
datasets_loader.py ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # datasets_loader.py
2
+
3
+ from datasets import load_dataset, DatasetDict
4
+ from transformers import AutoTokenizer
5
+
6
def load_threat_dataset(
    path: str,
    tokenizer_name="bert-base-chinese",
    *,
    max_length: int = 256,
    test_size: float = 0.2,
    seed: int = 42,
) -> "DatasetDict":
    """
    Load a CSV dataset of Chinese cybercrime posts and tokenize it.

    The CSV is expected to provide the columns ``text`` and ``label``;
    only ``text`` is read here (it is what gets tokenized), every other
    column is passed through unchanged.

    Args:
        path: Location of the CSV data, forwarded to ``load_dataset`` as
            ``data_files``.
        tokenizer_name: Name or path handed to
            ``AutoTokenizer.from_pretrained``.
        max_length: Length every example is truncated and padded to.
        test_size: Share of the rows held out as the test split when the
            loaded data does not already contain one.
        seed: Seed for the shuffle performed by that split, so repeated
            calls produce the same partition.

    Returns:
        A ``DatasetDict`` with exactly two entries, ``"train"`` and
        ``"test"``, both carrying the tokenizer outputs as extra columns.

    Raises:
        KeyError: If the loaded data has no ``"train"`` split.
    """
    raw = load_dataset("csv", data_files=path)

    # A single CSV path is loaded as a lone "train" split, so there is no
    # "test" entry to return. Carve one out instead of failing with a
    # KeyError further down; data that already defines "test" is kept as is.
    if "test" not in raw:
        raw = raw["train"].train_test_split(test_size=test_size, seed=seed)

    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)

    def tokenize(batch):
        # Fixed-length padding gives every example the same number of tokens.
        return tokenizer(
            batch["text"],
            truncation=True,
            padding="max_length",
            max_length=max_length,
        )

    tokenized = raw.map(tokenize, batched=True)

    # Only the two splits callers rely on are exposed; any others are dropped.
    return DatasetDict({
        "train": tokenized["train"],
        "test": tokenized["test"],
    })
29
+
30
+
31
+ # Example:
32
+ # ds = load_threat_dataset("dataset/threat_samples.csv")
33
+ # print(ds["train"][0])