File size: 826 Bytes
b65b044
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
# datasets_loader.py

from datasets import load_dataset, DatasetDict
from transformers import AutoTokenizer

def load_threat_dataset(
    path: str,
    tokenizer_name: str = "bert-base-chinese",
    test_size: float = 0.2,
    seed: int = 42,
) -> DatasetDict:
    """
    Load and tokenize a CSV dataset of Chinese cybercrime posts with labels.

    Expects columns: text, label.

    Args:
        path: Path to the CSV file.
        tokenizer_name: HuggingFace tokenizer/model name used to tokenize
            the ``text`` column.
        test_size: Fraction of rows held out for the test split.
        seed: RNG seed for the train/test split, for reproducibility.

    Returns:
        A DatasetDict with ``train`` and ``test`` splits, each carrying the
        original columns plus ``input_ids``/``attention_mask`` (and
        ``token_type_ids`` for BERT-style tokenizers).
    """
    raw = load_dataset("csv", data_files=path)

    # BUG FIX: load_dataset("csv", data_files=<single file>) yields a
    # DatasetDict with ONLY a "train" split, so indexing "test" below used
    # to raise KeyError. Create the test split explicitly.
    splits = raw["train"].train_test_split(test_size=test_size, seed=seed)

    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)

    def tokenize(batch):
        # Fixed-length padding keeps batches rectangular without a collator.
        return tokenizer(
            batch["text"],
            truncation=True,
            padding="max_length",
            max_length=256,
        )

    tokenized = splits.map(tokenize, batched=True)

    return DatasetDict({
        "train": tokenized["train"],
        "test": tokenized["test"],
    })


# Example:
# ds = load_threat_dataset("dataset/threat_samples.csv")
# print(ds["train"][0])