# datasets_loader.py
from datasets import load_dataset, DatasetDict
from transformers import AutoTokenizer

def load_threat_dataset(path: str, tokenizer_name: str = "bert-base-chinese") -> DatasetDict:
    """
    Loads a dataset of Chinese cybercrime posts with labels.
    Expects a CSV with columns: text, label.
    """
    # load_dataset("csv", ...) puts everything under a single "train"
    # split, so carve out a held-out test set before tokenizing
    # (80/20 split and seed are choices; adjust as needed).
    raw = load_dataset("csv", data_files=path)
    splits = raw["train"].train_test_split(test_size=0.2, seed=42)

    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)

    def tokenize(batch):
        # Pad/truncate every post to a fixed 256-token window.
        return tokenizer(
            batch["text"],
            truncation=True,
            padding="max_length",
            max_length=256,
        )

    tokenized = splits.map(tokenize, batched=True)
    return DatasetDict({
        "train": tokenized["train"],
        "test": tokenized["test"],
    })

# Example:
# ds = load_threat_dataset("dataset/threat_samples.csv")
# print(ds["train"][0])
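
# The returned DatasetDict plugs straight into the transformers Trainer.
# A minimal fine-tuning sketch, assuming binary labels and a local output
# directory named "threat-model" (neither is fixed by this loader):
#
# from transformers import (
#     AutoModelForSequenceClassification, Trainer, TrainingArguments
# )
#
# model = AutoModelForSequenceClassification.from_pretrained(
#     "bert-base-chinese", num_labels=2  # assumption: two label classes
# )
# trainer = Trainer(
#     model=model,
#     args=TrainingArguments(output_dir="threat-model", num_train_epochs=3),
#     train_dataset=ds["train"],
#     eval_dataset=ds["test"],
# )
# trainer.train()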