# datasets_loader.py
from datasets import load_dataset, DatasetDict
from transformers import AutoTokenizer


def load_threat_dataset(path: str, tokenizer_name="bert-base-chinese"):
    """
    Loads a CSV dataset of Chinese cybercrime posts with labels.
    Expects columns: text, label
    """
    # load_dataset("csv", ...) with a single file yields only a "train" split,
    # so carve out a held-out test split here (80/20 split; adjust as needed).
    raw = load_dataset("csv", data_files=path)["train"]
    splits = raw.train_test_split(test_size=0.2, seed=42)

    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)

    def tokenize(batch):
        return tokenizer(
            batch["text"],
            truncation=True,
            padding="max_length",
            max_length=256,
        )

    tokenized = splits.map(tokenize, batched=True)
    return DatasetDict({
        "train": tokenized["train"],
        "test": tokenized["test"],
    })


# Example:
# ds = load_threat_dataset("dataset/threat_samples.csv")
# print(ds["train"][0])