Spaces: Running
jisubae committed · commit 5e8f045
Parent(s):
initial commit
Browse files
- .gitattributes +35 -0
- .gitignore +114 -0
- Dockerfile +36 -0
- README.md +153 -0
- app.py +96 -0
- config.py +76 -0
- data/leaderboard_results.csv +1 -0
- data/public/ko-freshqa_2025_dev.csv +0 -0
- data/public/ko-freshqa_2025_test.csv +0 -0
- docker-compose.yml +19 -0
- env.example +97 -0
- environment.yml +21 -0
- freshqa/fresheval.py +358 -0
- freshqa/fresheval_parallel.py +113 -0
- freshqa/freshqa_acc.py +361 -0
- freshqa/merge_csv_with_model_response.py +187 -0
- requirements.txt +36 -0
- src/api_key_rotator.py +78 -0
- src/hf_private_csv_loader.py +279 -0
- src/leaderboard_manager.py +215 -0
- src/quick_csv_loader.py +158 -0
- src/submission_handler.py +615 -0
- src/submission_tracker.py +304 -0
- src/utils.py +58 -0
- ui/dataset_tab.py +142 -0
- ui/leaderboard_tab.py +229 -0
- ui/styles.css +136 -0
- ui/submission_tab.py +98 -0
.gitattributes
ADDED
@@ -0,0 +1,35 @@
*.7z filter=lfs diff=lfs merge=lfs -text
*.arrow filter=lfs diff=lfs merge=lfs -text
*.bin filter=lfs diff=lfs merge=lfs -text
*.bz2 filter=lfs diff=lfs merge=lfs -text
*.ckpt filter=lfs diff=lfs merge=lfs -text
*.ftz filter=lfs diff=lfs merge=lfs -text
*.gz filter=lfs diff=lfs merge=lfs -text
*.h5 filter=lfs diff=lfs merge=lfs -text
*.joblib filter=lfs diff=lfs merge=lfs -text
*.lfs.* filter=lfs diff=lfs merge=lfs -text
*.mlmodel filter=lfs diff=lfs merge=lfs -text
*.model filter=lfs diff=lfs merge=lfs -text
*.msgpack filter=lfs diff=lfs merge=lfs -text
*.npy filter=lfs diff=lfs merge=lfs -text
*.npz filter=lfs diff=lfs merge=lfs -text
*.onnx filter=lfs diff=lfs merge=lfs -text
*.ot filter=lfs diff=lfs merge=lfs -text
*.parquet filter=lfs diff=lfs merge=lfs -text
*.pb filter=lfs diff=lfs merge=lfs -text
*.pickle filter=lfs diff=lfs merge=lfs -text
*.pkl filter=lfs diff=lfs merge=lfs -text
*.pt filter=lfs diff=lfs merge=lfs -text
*.pth filter=lfs diff=lfs merge=lfs -text
*.rar filter=lfs diff=lfs merge=lfs -text
*.safetensors filter=lfs diff=lfs merge=lfs -text
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
*.tar.* filter=lfs diff=lfs merge=lfs -text
*.tar filter=lfs diff=lfs merge=lfs -text
*.tflite filter=lfs diff=lfs merge=lfs -text
*.tgz filter=lfs diff=lfs merge=lfs -text
*.wasm filter=lfs diff=lfs merge=lfs -text
*.xz filter=lfs diff=lfs merge=lfs -text
*.zip filter=lfs diff=lfs merge=lfs -text
*.zst filter=lfs diff=lfs merge=lfs -text
*tfevents* filter=lfs diff=lfs merge=lfs -text
.gitignore
ADDED
@@ -0,0 +1,114 @@
# Python
__pycache__/
*.py[cod]
*$py.class
*.so
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# Virtual environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# IDE
.vscode/
.idea/
*.swp
*.swo
*~

# OS
.DS_Store
.DS_Store?
._*
.Spotlight-V100
.Trashes
ehthumbs.db
Thumbs.db

# Project specific - Test and temporary files (but keep leaderboard_results.csv)
# leaderboard_results.csv  # commented out so Git keeps tracking it
evaluation_results.json
test_model_responses.csv
*.log

# Private data (sensitive data)
#data/private/

# Jupyter Notebook
.ipynb_checkpoints

# pyenv
.python-version

# pipenv
Pipfile.lock

# pytest
.pytest_cache/
.coverage
htmlcov/

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Gradio temporary files
gradio_cached_examples/
flagged/

# Lock files for file locking mechanism
*.lock

# MacOS specific
.AppleDouble
.LSOverride
Icon

# Thumbnails
._*

# Files that might appear in the root of a volume
.DocumentRevisions-V100
.fseventsd
.TemporaryItems
.VolumeIcon.icns
.com.apple.timemachine.donotpresent

# Directories potentially created on remote AFP share
.AppleDB
.AppleDesktop
Network Trash Folder
Temporary Items
.apdisk

# Setup artifacts and temporary files
=*.*
fix_setup.sh
install_dependencies.sh
step_by_step.sh
simple_test.py

# Data
freshqa/freshqa_prompt.py
Dockerfile
ADDED
@@ -0,0 +1,36 @@
FROM python:3.9-slim

WORKDIR /app

# Update system packages and set up the timezone
RUN apt-get update && apt-get install -y \
    git \
    curl \
    tzdata \
    && rm -rf /var/lib/apt/lists/*

# Set the timezone to Asia/Seoul
ENV TZ=Asia/Seoul
RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone

# Install Python dependencies
COPY requirements.txt .
RUN pip install --no-cache-dir --upgrade pip
RUN pip install --no-cache-dir -r requirements.txt

# Copy application files
COPY . .

# Set permissions
RUN chmod +x quick_start.sh

# Environment variables
ENV PYTHONPATH=/app
ENV GRADIO_SERVER_NAME=0.0.0.0
ENV GRADIO_SERVER_PORT=7860

# Expose the port
EXPOSE 7860

# Default command
CMD ["python", "app.py"]
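For local testing outside of Spaces, one way to build and run this image is the sketch below, assuming a populated `.env` file in the project root; the image tag is arbitrary:

```bash
docker build -t kofreshqa-leaderboard .
docker run --rm -p 7860:7860 --env-file .env kofreshqa-leaderboard
```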
README.md
ADDED
@@ -0,0 +1,153 @@
---
title: Ko-FreshQA Leaderboard
emoji: π
colorFrom: blue
colorTo: green
sdk: gradio
app_file: app.py
pinned: false
license: apache-2.0
---

## Ko-FreshQA Leaderboard
An automatic evaluation and leaderboard system built on Korean FreshQA. It matches the `model_response` column of a participant-uploaded CSV against the reference data, runs Relaxed/Strict evaluation with the Upstage Solar model, and reflects the results in the leaderboard. The app runs as a Gradio UI.

### Key features
- Dataset distribution: a tab for downloading the DEV/TEST CSVs
- Submission and automatic evaluation: uploaded CSV → merge → evaluate → aggregate metrics → update the leaderboard
- Detailed metrics: accuracy by fact type, premise validity (vp/fp), hops (one/multi), year (old/new), and domain
- Submission limit (optional): at most 3 submissions per user per day (backed by a Hugging Face repository)

---

## Directory structure
- `app.py`: Gradio app initialization and tab composition
- `config.py`: environment variable loading and required-setting validation
- `freshqa/`
  - `fresheval.py`: single-sample evaluation logic
  - `fresheval_parallel.py`: parallel evaluation wrapper for dataframes
  - `freshqa_acc.py`: aggregation of evaluation results (accuracy and per-domain statistics)
  - `merge_csv_with_model_response.py`: merges the reference data with the user CSV
- `src/`
  - `submission_handler.py`: end-to-end orchestration from submission to leaderboard update
  - `submission_tracker.py`: submission history tracking (HF repo based, optional)
  - `leaderboard_manager.py`: leaderboard CSV load/save and display cleanup
  - `quick_csv_loader.py`, `hf_private_csv_loader.py`: utilities for loading CSVs from private HF repos
  - `api_key_rotator.py`, `utils.py`: utilities
- `ui/`
  - `leaderboard_tab.py`, `submission_tab.py`, `dataset_tab.py`, `styles.css`
- `data/leaderboard_results.csv`: cumulative leaderboard data

---

## Requirements
- Python 3.10
- Upstage API key (single or multiple)
- Hugging Face token (for private repo access)
- Hugging Face dataset repos
  - Reference data: `FRESHQA_DATA_REPO_ID` / `FRESHQA_DATA_FILENAME`
  - (Optional) submission tracker: `SUBMISSION_TRACKER_REPO_ID`

Install:
```bash
python -m venv venv && source venv/bin/activate
pip install -r requirements.txt
```

Or with Conda:
```bash
conda env create -f environment.yml
conda activate freshqa-leaderboard
```

---

## Environment variables (.env)
Copy `env.example` to `.env` and fill in the values:
```bash
cp env.example .env
```

Required/main variables
- HF_TOKEN
- FRESHQA_DATA_REPO_ID
- FRESHQA_DATA_FILENAME (default: ko-freshqa_2025_total.csv)
- UPSTAGE_API_KEY or UPSTAGE_API_KEYS (comma-separated)
- ENABLE_SUBMISSION_LIMIT (default: true)
- SUBMISSION_TRACKER_REPO_ID (required when the submission limit is used)

Validation: at startup, `Config.validate_required_configs()` checks for missing required settings.

---

## Running
Local:
```bash
python app.py
```
Default port: 7860

Hugging Face Spaces:
- If the `SPACE_ID` environment variable is present, the app runs in Spaces mode.

Docker (optional):
- `Dockerfile` and `docker-compose.yml` are provided (adjust them to your setup as needed)

---

## Usage (Gradio UI)
1) Dataset tab
- Download the DEV/TEST CSVs

2) Submit & Evaluate tab
- Upload: the TEST CSV with the `model_response` column filled in
- Input: submitter name, model used, description
- Evaluation: Relaxed/Strict are both run with the Upstage Solar model
- Output: overall and detailed metrics are computed and reflected in the leaderboard

3) Leaderboard tab
- Submission results accumulate in `data/leaderboard_results.csv`
- Search and refresh are supported

---

## Internal workflow
1) Submission entry point: `src/submission_handler.py::process_submission`
2) Load the user CSV and merge it with the reference data:
   - `freshqa/merge_csv_with_model_response.py::merge_dataframe_with_model_response_df`
3) Evaluation:
   - `freshqa/fresheval_parallel.py::evaluate_dataframe` → `freshqa/fresheval.py::FreshEval`
4) Accuracy aggregation:
   - `freshqa/freshqa_acc.py::calculate_accuracy`, `process_freshqa_dataframe`
5) Saving:
   - Leaderboard: `src/leaderboard_manager.py::append_to_leaderboard_data`
   - (Optional) submission history: `src/submission_tracker.py` (only when ENABLE_SUBMISSION_LIMIT=true)

Note: when `ENABLE_SUBMISSION_LIMIT=false`, the code is written so that it does not attempt to access the Hugging Face repository used for submission-history tracking.
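A minimal, self-contained sketch of steps 2–5 above, using a one-row stand-in frame (the real merge step and the exact leaderboard-append signature are not shown on this page, so they appear only as comments); running it requires the evaluation prompts and an Upstage API key to be configured:

```python
import pandas as pd

from freshqa.fresheval_parallel import evaluate_dataframe_parallel
from freshqa.freshqa_acc import process_freshqa_dataframe, calculate_accuracy_simple

# Step 2 (merge) normally produces this frame; a one-row stand-in keeps the
# example self-contained. Column names follow the evaluation code in this commit.
merged_df = pd.DataFrame([{
    "question": "example question",
    "model_response": "example answer",
    **{f"answer_{i}": None for i in range(10)},
}])

# Step 3: Relaxed evaluation (Strict works the same way with mode="Strict")
evaluated_df = evaluate_dataframe_parallel(merged_df, mode="Relaxed")

# Step 4: aggregate accuracy; the full calculate_accuracy() expects split/fact_type/...
# columns, so the simpler aggregator is used for this stand-in frame
metrics = calculate_accuracy_simple(process_freshqa_dataframe(evaluated_df))
print(metrics)

# Step 5: the real handler appends the metrics row to data/leaderboard_results.csv
# via src/leaderboard_manager.py::append_to_leaderboard_data
```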

---

## Submission limit (optional)
- Setting: `ENABLE_SUBMISSION_LIMIT=true` (the default)
- Storage: `user_submissions.json` managed in the `SUBMISSION_TRACKER_REPO_ID` repository
- Logic:
  - Up to 3 successful submissions per user are counted per day
  - The daily count resets at 00:00 Korea time
  - When disabled (no HF repository access): `SubmissionHandler` does not create the tracker

---

## Troubleshooting
- "Required settings are missing" error at startup
  - Check `UPSTAGE_API_KEY` (or `UPSTAGE_API_KEYS`), `HF_TOKEN`, and `FRESHQA_DATA_REPO_ID` in `.env`
- An HF 404 warning appears even though the submission limit is disabled
  - Fixed so that the submission tracker is not initialized when `ENABLE_SUBMISSION_LIMIT=false`
- HF 404 (submission limit enabled)
  - If `user_submissions.json` does not exist in the `SUBMISSION_TRACKER_REPO_ID` repository, the first access can return a 404. Create the file with an empty JSON object `{}` (one way to do this is shown below).
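A minimal way to create that file from Python with `huggingface_hub`, assuming the tracker repo is a dataset repo and the token has write access (the repo id below is a placeholder for your `SUBMISSION_TRACKER_REPO_ID`):

```python
from huggingface_hub import HfApi

api = HfApi(token="hf_...")  # a token with write access to the tracker repo
api.upload_file(
    path_or_fileobj=b"{}",                  # empty JSON object
    path_in_repo="user_submissions.json",
    repo_id="your-org/submission-tracker",  # placeholder for SUBMISSION_TRACKER_REPO_ID
    repo_type="dataset",
)
```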

---

## License / attribution
- This leaderboard was inspired by FreshQA.

Please file questions as issues.
app.py
ADDED
@@ -0,0 +1,96 @@
"""
Ko-FreshQA Leaderboard main application

Provides a Gradio-based web interface.
"""

import os
import gradio as gr
from config import Config
from ui.leaderboard_tab import create_leaderboard_tab
from ui.submission_tab import create_submission_tab
from ui.dataset_tab import create_dataset_tab


def load_css():
    """Load the CSS file."""
    current_dir = os.path.dirname(os.path.abspath(__file__))
    css_path = os.path.join(current_dir, 'ui', 'styles.css')

    try:
        with open(css_path, 'r', encoding='utf-8') as f:
            return f.read()
    except FileNotFoundError:
        print(f"CSS file not found: {css_path}")
        raise FileNotFoundError(f"CSS file not found: {css_path}")


def create_interface():
    """Create the main interface."""

    css_content = load_css()

    with gr.Blocks(
        title="Ko-FreshQA Leaderboard",
        theme=gr.themes.Soft(),
        css=css_content
    ) as app:
        gr.Markdown("# Ko-FreshQA Leaderboard")

        with gr.Tabs():
            # Leaderboard tab
            with gr.Tab("Leaderboard"):
                create_leaderboard_tab()

            # Submission and evaluation tab
            with gr.Tab("Submit & Evaluate"):
                create_submission_tab()

            # Dataset download tab
            with gr.Tab("Dataset"):
                create_dataset_tab()

    return app


def main():
    """Main entry point."""
    print("Starting the Korean FreshQA leaderboard...")
    print("Use the 'Submit & Evaluate' tab to submit to the leaderboard.")

    # Validate required settings
    try:
        Config.validate_required_configs()
        print("Required settings validated")
    except ValueError as e:
        print(f"Configuration error: {e}")
        import sys
        sys.exit(1)

    app = create_interface()

    # Detect the Hugging Face Spaces environment
    is_huggingface_spaces = Config.IS_HUGGINGFACE_SPACES

    if is_huggingface_spaces:
        print("Running in the Hugging Face Spaces environment...")
        app.launch(
            server_name="0.0.0.0",
            server_port=7860,
            share=False,
            debug=False,
            show_error=True
        )
    else:
        print("Running in the local environment...")
        app.launch(
            server_name="127.0.0.1",
            server_port=7860,
            share=False,
            debug=True,
            show_error=True
        )


if __name__ == "__main__":
    main()
config.py
ADDED
@@ -0,0 +1,76 @@
"""
Configuration module.
Manages all application settings in one place.
"""

import os
from dotenv import load_dotenv

# Load the .env file
load_dotenv()

class Config:
    """Application configuration class."""

    # API settings
    # Multi-key support: UPSTAGE_API_KEYS (comma-separated) takes precedence; otherwise use the single key
    _UPSTAGE_API_KEYS_RAW = os.getenv('UPSTAGE_API_KEYS')
    if _UPSTAGE_API_KEYS_RAW:
        _parsed_keys = [k.strip() for k in _UPSTAGE_API_KEYS_RAW.split(',') if k.strip()]
    else:
        _single = os.getenv('UPSTAGE_API_KEY')
        _parsed_keys = [
            _single.strip()
        ] if (_single and _single.strip()) else []

    # Public attributes: expose the first key under the legacy name for compatibility
    UPSTAGE_API_KEYS = _parsed_keys
    UPSTAGE_API_KEY = _parsed_keys[0] if _parsed_keys else None
    HF_TOKEN = os.getenv('HF_TOKEN')

    # Data settings
    FRESHQA_DATA_REPO_ID = os.getenv('FRESHQA_DATA_REPO_ID')
    FRESHQA_DATA_FILENAME = os.getenv('FRESHQA_DATA_FILENAME', 'ko-freshqa_2025_total.csv')

    # Prompt settings
    # Multiline values supported: the raw text loaded by python-dotenv is used as-is
    FRESHQA_PROMPT_RELAXED = os.getenv('FRESHQA_PROMPT_RELAXED')
    FRESHQA_PROMPT_STRICT = os.getenv('FRESHQA_PROMPT_STRICT')
    try:
        # Fall back to the bundled prompts when the env values are empty
        from freshqa.freshqa_prompt import (
            FRESHQA_PROMPT_RELAXED as _DEFAULT_PROMPT_RELAXED,
            FRESHQA_PROMPT_STRICT as _DEFAULT_PROMPT_STRICT,
        )
        if not FRESHQA_PROMPT_RELAXED:
            FRESHQA_PROMPT_RELAXED = _DEFAULT_PROMPT_RELAXED
        if not FRESHQA_PROMPT_STRICT:
            FRESHQA_PROMPT_STRICT = _DEFAULT_PROMPT_STRICT
    except Exception:
        # freshqa/freshqa_prompt.py is not distributed with this repo
        pass

    # Submission limit settings
    ENABLE_SUBMISSION_LIMIT = os.getenv('ENABLE_SUBMISSION_LIMIT', 'true').lower() == 'true'
    SUBMISSION_TRACKER_REPO_ID = os.getenv('SUBMISSION_TRACKER_REPO_ID')

    # Environment settings
    IS_HUGGINGFACE_SPACES = os.getenv("SPACE_ID") is not None

    @classmethod
    def validate_required_configs(cls):
        """Check that all required settings are present."""
        missing_configs = []

        # Either the multi-key or the single-key variable is accepted: at least one key must exist
        if not cls.UPSTAGE_API_KEYS:
            # Mention both variables in the message
            missing_configs.append('UPSTAGE_API_KEY or UPSTAGE_API_KEYS')
        if not cls.HF_TOKEN:
            missing_configs.append('HF_TOKEN')
        if not cls.FRESHQA_DATA_REPO_ID:
            missing_configs.append('FRESHQA_DATA_REPO_ID')
        if not cls.FRESHQA_PROMPT_RELAXED:
            missing_configs.append('FRESHQA_PROMPT_RELAXED')
        if not cls.FRESHQA_PROMPT_STRICT:
            missing_configs.append('FRESHQA_PROMPT_STRICT')

        if missing_configs:
            raise ValueError(f"Required settings are missing: {', '.join(missing_configs)}")

        return True
data/leaderboard_results.csv
ADDED
@@ -0,0 +1 @@
id,model,description,accuracy,fast_changing_accuracy,slow_changing_accuracy,never_changing_accuracy,acc_vp,acc_fp,acc_vp_one_hop,acc_vp_two_hop,acc_fp_one_hop,acc_fp_two_hop,acc_vp_old,acc_vp_new,acc_fp_old,acc_fp_new,acc_politics,acc_sports,acc_entertainment,acc_weather,acc_world,acc_economy,acc_society,acc_it_science,acc_life_culture,acc_unknown,total_questions,evaluation_date,evaluation_mode
data/public/ko-freshqa_2025_dev.csv
ADDED
The diff for this file is too large to render. See raw diff.
data/public/ko-freshqa_2025_test.csv
ADDED
The diff for this file is too large to render. See raw diff.
docker-compose.yml
ADDED
@@ -0,0 +1,19 @@
version: '3.8'

services:
  freshqa-leaderboard:
    build: .
    ports:
      - "7860:7860"
    environment:
      - UPSTAGE_API_KEY=${UPSTAGE_API_KEY}
    volumes:
      - ./datasets:/app/datasets
      - ./results:/app/results
    restart: unless-stopped
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:7860/"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 40s
env.example
ADDED
@@ -0,0 +1,97 @@
# ===========================================
# FreshQA Leaderboard environment variable example
# ===========================================

# ===========================================
# Hugging Face settings
# ===========================================
# Hugging Face API token (required)
# - Used to access private repositories
# - Issue one at https://huggingface.co/settings/tokens
HF_TOKEN=your_huggingface_token_here

# ===========================================
# FreshQA reference data settings
# ===========================================
# HuggingFace repository ID that holds the reference data (required)
# Format: username/repository-name
FRESHQA_DATA_REPO_ID=james-demo-leaderboard-backend/kofreshqa-data-origin

# Reference data filename (optional, default: ko-freshqa_2025_total.csv)
FRESHQA_DATA_FILENAME=ko-freshqa_2025_total.csv

# ===========================================
# FreshQA prompt settings
# ===========================================
# Fresheval prompt bodies
# The evaluation prompts are not distributed.
FRESHQA_PROMPT_RELAXED=
FRESHQA_PROMPT_STRICT=

# ===========================================
# Submission tracking settings
# ===========================================
# HuggingFace repository ID that stores submission records (required)
# Format: username/repository-name
SUBMISSION_TRACKER_REPO_ID=james-demo-leaderboard-backend/submission-tracker

# Whether the submission limit is enabled (optional, default: true)
# - true: submission limit enabled (3 per day)
# - false: submission limit disabled (for local testing)
ENABLE_SUBMISSION_LIMIT=true

# ===========================================
# AI evaluation API settings
# ===========================================
# Upstage Solar Pro API key (required)
# - Used for model evaluation
# - Issue one at https://console.upstage.ai/
UPSTAGE_API_KEY=your_upstage_api_key_here

# To use multiple Upstage API keys (optional)
# - Enter them comma-separated
# - If UPSTAGE_API_KEYS is present it takes precedence;
#   otherwise the single UPSTAGE_API_KEY variable is used.
# Example)
# UPSTAGE_API_KEYS=keyA,keyB,keyC

# ===========================================
# Application settings
# ===========================================
# Server port (optional, default: 7860)
# PORT=7860

# Debug mode (optional, default: false)
# DEBUG=false

# ===========================================
# Usage
# ===========================================
# 1. Copy this file to .env:
#    cp env.example .env
#
# 2. Replace the values with real ones:
#    - HF_TOKEN: your actual HuggingFace token
#    - FRESHQA_DATA_REPO_ID: the actual repository ID
#    - FRESHQA_DATA_FILENAME: the actual filename (optional)
#    - SUBMISSION_TRACKER_REPO_ID: the submission-tracking repository ID
#    - ENABLE_SUBMISSION_LIMIT: whether the submission limit is enabled (set false for local testing)
#    - UPSTAGE_API_KEY: your actual Upstage API key
#
# 3. Loaded automatically by Python (app.py calls load_dotenv())
#    Or load it manually:
#    from dotenv import load_dotenv
#    load_dotenv()
#
# 4. Or set the environment variables directly (system level):
#    export HF_TOKEN="your_token"
#    export FRESHQA_DATA_REPO_ID="username/repo"
#    export FRESHQA_DATA_FILENAME="filename.csv"
#    export UPSTAGE_API_KEY="your_api_key"

# ===========================================
# Security notes
# ===========================================
# - Never commit the .env file to Git
# - Keep real tokens and API keys safe
# - In production, set the environment variables directly
environment.yml
ADDED
@@ -0,0 +1,21 @@
name: freshqa-leaderboard
channels:
  - conda-forge
  - defaults
dependencies:
  - python=3.9
  - pip
  - numpy>=1.24.0
  - pandas>=2.0.0
  - requests>=2.25.0
  - pip:
      - gradio>=5.0.0
      - plotly>=5.0.0
      - pytz>=2023.3
      - python-dateutil>=2.8.0
      - openpyxl>=3.0.0
      - httpx>=0.24.0
      - seaborn>=0.12.0
      - matplotlib>=3.7.0
      - tqdm>=4.65.0
      - huggingface_hub<1.0.0
freshqa/fresheval.py
ADDED
@@ -0,0 +1,358 @@
import re
import pandas as pd
from openai import OpenAI
from typing import List, Dict, Any, Tuple
import time
import random

from config import Config
from src.utils import get_current_date_str

class FreshEval:

    def __init__(self, model: str='solar-pro2', api_key: str=None):
        self.model = model
        self.api_key = api_key or Config.UPSTAGE_API_KEY
        self.client = OpenAI(
            api_key=self.api_key,
            base_url="https://api.upstage.ai/v1/solar"
        )

        self.temperature = 0.0
        self.max_tokens = 256
        self.chat_completions = True

        if model.startswith('gpt-4') or model.startswith('solar'):
            self.num_organic_results = 15
            self.num_related_questions = 3
            self.num_questions_and_answers = 3
            self.num_retrieved_evidences = 15
        else:
            self.num_organic_results = 15
            self.num_related_questions = 2
            self.num_questions_and_answers = 2
            self.num_retrieved_evidences = 5


    def _is_rate_limit_error(self, error: Exception) -> bool:
        """Detect 429 (rate limit) errors."""
        error_str = str(error)
        error_type = type(error).__name__

        # 1. Check the HTTP status code
        if hasattr(error, 'response') and hasattr(error.response, 'status_code'):
            if error.response.status_code == 429:
                print(f"HTTP 429 error detected: {error.response.status_code}")
                return True

        # 2. Text-based detection (fallback)
        error_lower = error_str.lower()
        if ("429" in error_lower or
            "rate" in error_lower or
            "limit" in error_lower or
            "too_many_requests" in error_lower or
            "request limit" in error_lower):
            # print("text-based 429 error detected")
            return True

        return False


    def call_llm_api(self, prompt:str, current_date:str) -> str:
        """Call the LLM API (with key rotation and backoff support)."""
        from src.api_key_rotator import get_rotator

        rotator = get_rotator()
        num_keys = len(rotator.keys)
        base_delay = 3.0

        def _make_api_call(eval_instance: FreshEval) -> str:
            """Helper that performs the API call."""
            if eval_instance.chat_completions:
                # Chat completions API
                response = eval_instance.client.chat.completions.create(
                    model=eval_instance.model,
                    temperature=eval_instance.temperature,
                    max_tokens=eval_instance.max_tokens,
                    messages=[
                        {
                            "role": "system",
                            "content": (
                                f"You are a helpful assistant. Respond as concisely as possible. Knowledge cutoff: {current_date}."
                            )
                        },
                        {
                            "role": "user",
                            "content": "What's today's date?"
                        },
                        {
                            "role": "assistant",
                            "content": f"Today is {current_date} in Pacific Standard Time."
                        },
                        {
                            "role": "user",
                            "content": prompt
                        }
                    ],
                )
                return response.choices[0].message.content
            else:
                # Completions API
                response = eval_instance.client.completions.create(
                    model=eval_instance.model,
                    temperature=eval_instance.temperature,
                    max_tokens=eval_instance.max_tokens,
                    prompt=prompt,
                )
                return response.choices[0].text

        # Start with the current key
        current_key = self.api_key
        current_instance = FreshEval(model=self.model, api_key=current_key)

        # Single key: use the existing backoff logic only
        if num_keys == 1:
            max_retries = 7
            for attempt in range(max_retries):
                try:
                    return _make_api_call(current_instance)
                except Exception as e:
                    if self._is_rate_limit_error(e):
                        if attempt < max_retries - 1:
                            # Exponential backoff
                            delay = base_delay * (2 ** attempt) + random.uniform(0, 1)
                            time.sleep(delay)
                            continue
                        else:
                            print("Maximum number of retries exceeded")
                            raise e

            # If we never returned before exceeding max_retries, raise
            raise Exception("call llm api: maximum number of retries exceeded")

        # Two or more keys: key rotation logic (with a short wait)
        # Cycle through the keys until a call succeeds (at most 3 full passes over all keys)
        max_attempts = num_keys * 3  # try every key up to 3 times
        key_attempt_count = 0

        # First attempt with the current key
        for attempt in range(max_attempts):
            try:
                return _make_api_call(current_instance)  # return immediately on success
            except Exception as e:
                if self._is_rate_limit_error(e):
                    key_attempt_count += 1
                    # Wait 2 seconds before switching to the next key
                    time.sleep(2)
                    current_key = rotator.pick_key()
                    # print("switching key")
                    current_instance = FreshEval(model=self.model, api_key=current_key)
                    continue  # keep trying with the next key
                else:
                    # Non-429 errors propagate immediately
                    raise

        # Maximum attempts exceeded (every key tried several times, all failed)
        raise Exception(f"All API keys returned 429 errors (tried up to {max_attempts} times)")


    def call_fresheval(self, mode:str, question:str, evaluation:str, current_date:str) -> str:
        """Run a FreshEval evaluation call."""

        fresheval_question = f'\nquestion: {question}{evaluation}'

        # Prefer the prompt body (prefix + demos) from the environment variables
        env_prompt_body = None
        if mode == 'Relaxed':
            env_prompt_body = Config.FRESHQA_PROMPT_RELAXED
        elif mode == 'Strict':
            env_prompt_body = Config.FRESHQA_PROMPT_STRICT

        if env_prompt_body and str(env_prompt_body).strip():
            base_prompt = str(env_prompt_body).strip()
        else:
            raise ValueError(f"No evaluation prompt is configured for {mode} mode.")

        fresheval_prompt = base_prompt + fresheval_question

        # Evaluate
        answer = self.call_llm_api(fresheval_prompt, current_date)

        return answer


    def extract_ratings(self, response:str) -> Tuple[bool, Dict[str, str]]:
        """Extract the rating from the evaluation output."""
        def _clean(text: str) -> str:
            # Strip surrounding markup/whitespace, remove inner markup, lowercase
            text = re.sub(r'^[*`_~\s]+|[*`_~\s]+$', '', text)
            text = re.sub(r'[*`_~]', '', text)
            return text.strip().strip('.').strip().lower()

        def _judge(val: str):
            """
            Decide correct/incorrect from a string.
            - if 'incorrect' appears, always FALSE
            - 'partially correct' is ambiguous -> None
            - 'correct' is TRUE
            """
            if re.search(r'(?i)\bincorrect\b', val):
                return 'FALSE'
            if re.search(r'(?i)\bpartial(?:ly)?\s+correct\b', val):
                return None
            if re.search(r'(?i)\bcorrect\b', val):
                return 'TRUE'
            return None

        def _from_label(block_label: str):
            """
            Using a label (e.g. 'Final Evaluation' or 'Evaluation'):
            - first try to capture the value on the same line
            - if that fails, scan from the label to the next blank line for the judgement keyword
            """
            # Same-line capture: label (with optional markup and colon) followed by the rest of the line
            same_line = re.search(
                rf'(?i){block_label}\s*(?:[*`_~]*\s*:\s*|:\s*[*`_~]*)\s*([^\r\n]+)',
                response
            )
            if same_line:
                val = _clean(same_line.group(1))
                j = _judge(val)
                if j is not None:
                    return j

            # Find only the label position (value may be on the next line), then scan up to the next blank line (or section)
            pos = re.search(
                rf'(?i){block_label}\s*(?:[*`_~]*\s*:\s*|:\s*[*`_~]*)',
                response
            )
            if pos:
                tail = response[pos.end():]
                # Only look until the next blank line (consecutive newlines) or section start, so we do not scan too far
                m_stop = re.search(r'\n\s*\n', tail)
                segment = tail[:m_stop.start()] if m_stop else tail[:300]  # safe upper bound
                seg_clean = _clean(segment)
                j = _judge(seg_clean)
                if j is not None:
                    return j
            return None

        # 1) 'Final Evaluation' has the highest priority
        final_judgement = _from_label(r'final\s+evaluation')
        if final_judgement:
            return True, {'rating': final_judgement}

        # 2) 'Evaluation'
        eval_judgement = _from_label('evaluation')
        if eval_judgement:
            return True, {'rating': eval_judgement}

        # 3) Fallback: 'credited' sentences
        if re.search(r'(?i)thus,\s*the\s*response\s*is\s*credited\b', response):
            return True, {'rating': 'TRUE'}
        if re.search(r'(?i)thus,\s*the\s*response\s*is\s*not\s*credited\b', response):
            return True, {'rating': 'FALSE'}

        # 4) Failure
        return False, {'rating': None}


    def evaluate_single_row(self, row: pd.Series, mode: str, current_date:str) -> Dict[str, Any]:
        """Evaluate a single row."""
        question = row['question']
        response = row['model_response']
        correct_answers = [row[f'answer_{i}'] for i in range(10)]
        correct_answers = [str(x) for x in correct_answers if pd.notna(x) and str(x).strip()]


        # If model_response is empty or NaN, mark the row as incorrect and return immediately
        if pd.isna(response) or (isinstance(response, str) and response.strip() == ''):
            # print('model_response is empty; treating as rating=0')
            row_dict = row.to_dict()
            row_dict['rating'] = 0
            row_dict['explanation'] = "model_response is empty"
            return row_dict

        # Build the evaluation template
        evaluation_template = (
            "\ncorrect answer(s): {correct_answers}"
            "\nresponse: {response}"
            "\ncomment: "
        )
        evaluation = evaluation_template.format(
            correct_answers=' | '.join(correct_answers),
            response=response,
        )

        # Evaluate
        fresheval_response = self.call_fresheval(
            mode=mode,
            question=question,
            evaluation=evaluation,
            current_date=current_date
        )

        is_valid_eval, eval_result = self.extract_ratings(fresheval_response)

        # if is_valid_eval:
        #     print('done')

        # Cap the number of re-evaluations (at most 5)
        max_retries = 5
        retry_count = 0

        # Retry loop
        while not is_valid_eval and retry_count < max_retries:
            retry_count += 1
            # print(f'Invalid evaluation, re-evaluating... ({retry_count}/{max_retries})\n response: {fresheval_response}')

            fresheval_response = self.call_fresheval(
                mode=mode,
                question=question,
                evaluation=evaluation,
                current_date=current_date
            )

            is_valid_eval, eval_result = self.extract_ratings(fresheval_response)
            # if is_valid_eval:
            #     print('done')

        # Fall back to the default value when the maximum number of retries is exceeded
        if not is_valid_eval:
            print(f'Maximum number of retries ({max_retries}) exceeded. Using default: rating=0')
            eval_result = {'rating': 0}
            fresheval_response = "Default rating due to exceeding the retry limit"

        row_dict = row.to_dict()
        row_dict['rating'] = eval_result['rating']
        row_dict['explanation'] = fresheval_response

        # DEBUG: print details only for FALSE ratings
        # if eval_result['rating'] == 'FALSE':
        #     print(f"\n{'='*80}")
        #     print(f"Question rated FALSE")
        #     print(f"  Mode: {mode}")
        #     print(f"  Question: {question}")
        #     print(f"  Correct Answers: {' | '.join(correct_answers)}")
        #     print(f"  Model Response: {response}")
        #     print(f"\n  LLM evaluation response:")
        #     print(f"  {fresheval_response}")
        #     print(f"  Final Rating: {eval_result['rating']}")
        #     print(f"{'='*80}\n")

        return row_dict


    def evaluate_dataframe(self, df: pd.DataFrame, mode: str) -> pd.DataFrame:
        """Evaluate a dataframe."""

        freshevals = []
        current_date = get_current_date_str()

        len_df = len(df)
        for index, row in df.iterrows():
            print(f'{mode} evaluation in progress... {index+1}/{len_df}')
            row_dict = self.evaluate_single_row(row, mode, current_date)
            freshevals.append(row_dict)

        return pd.DataFrame(freshevals)
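To illustrate the rating-extraction contract above without touching the API, a small check against hypothetical evaluator outputs (the `__new__` trick only skips `__init__`, so no API key is needed; `extract_ratings` itself uses no instance state):

```python
from freshqa.fresheval import FreshEval

evaluator = FreshEval.__new__(FreshEval)  # bypass __init__: no OpenAI client or key required

ok, res = evaluator.extract_ratings("Final Evaluation: **correct**")
print(ok, res)   # True {'rating': 'TRUE'}

ok, res = evaluator.extract_ratings("Evaluation: incorrect, the premise is outdated.")
print(ok, res)   # True {'rating': 'FALSE'}

ok, res = evaluator.extract_ratings("Thus, the response is credited.")
print(ok, res)   # True {'rating': 'TRUE'}  (fallback sentence pattern)
```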
freshqa/fresheval_parallel.py
ADDED
@@ -0,0 +1,113 @@
import pandas as pd
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import Dict, Any
import time
import queue

from freshqa.fresheval import FreshEval
from src.api_key_rotator import get_rotator
from src.utils import get_current_date_str


class FreshEvalParallel:
    """Wrapper class around FreshEval for parallel processing."""

    def __init__(self, model: str = 'solar-pro2', max_workers: int = 4):
        self.model = model
        self.max_workers = max_workers

    def evaluate_dataframe(self, df: pd.DataFrame, mode: str, progress_queue: "queue.Queue[int] | None" = None, on_item_done=None) -> pd.DataFrame:
        """Evaluate a dataframe in parallel (with progress reporting)."""
        current_date = get_current_date_str()
        total_rows = len(df)

        # print(f"Starting {mode} mode evaluation: {total_rows} rows, {self.max_workers} workers")

        # Prepare per-worker arguments
        worker_args = []
        for index, row in df.iterrows():
            worker_args.append((row, mode, current_date))

        # Parallel processing (with progress reporting)
        results = [None] * total_rows  # pre-allocate
        completed_count = 0
        start_time = time.time()

        with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
            # Submit all tasks
            future_to_index = {
                executor.submit(self._evaluate_single_row_worker, args): i
                for i, args in enumerate(worker_args)
            }

            # Handle tasks in completion order
            for future in as_completed(future_to_index):
                original_index = future_to_index[future]
                try:
                    result = future.result()
                    results[original_index] = result
                    completed_count += 1

                    # Push progress to progress_queue (for Gradio UI updates)
                    if progress_queue is not None:
                        progress_queue.put(1)

                    # Invoke the on_item_done callback (optional)
                    if on_item_done:
                        on_item_done(original_index, result)

                    # Progress reporting
                    progress_percent = (completed_count / total_rows) * 100
                    elapsed_time = time.time() - start_time

                    # Report in 10% steps (at least every 10 items)
                    # total_rows // 10 = number of items per 10% (e.g. 3000 rows -> 300 items)
                    print_interval = max(10, total_rows // 10)  # at least 10 items, 10% steps
                    if (completed_count % print_interval == 0 or
                        completed_count == total_rows):
                        remaining_time = (elapsed_time / completed_count) * (total_rows - completed_count) if completed_count > 0 else 0
                        # print(f"{mode} mode progress: {progress_percent:.1f}% ({completed_count}/{total_rows}) - "
                        #       f"elapsed: {elapsed_time:.1f}s, estimated remaining: {remaining_time:.1f}s")
                        pass

                except Exception as e:
                    print(f"Evaluation failed (row {original_index}): {e}")
                    # Return a default value on failure
                    results[original_index] = {
                        'rating': 0,
                        'explanation': f"Evaluation failed: {str(e)}"
                    }
                    completed_count += 1

                    # Still report progress on failure
                    if progress_queue is not None:
                        progress_queue.put(1)

        total_time = time.time() - start_time
        print(f"{mode} mode evaluation finished in {total_time:.1f}s")

        return pd.DataFrame(results)

    def _evaluate_single_row_worker(self, args: tuple) -> Dict[str, Any]:
        """Worker function - each worker creates its own FreshEval instance."""
        row, mode, current_date = args

        # Each worker gets its own FreshEval instance (keys distributed via the rotator)
        api_key = get_rotator().pick_key()
        worker_eval = FreshEval(model=self.model, api_key=api_key)

        # Reuse the existing evaluate_single_row method
        return worker_eval.evaluate_single_row(row, mode, current_date)


# Convenience function
def evaluate_dataframe_parallel(
        df: pd.DataFrame,
        mode: str,
        on_item_done=None,
        progress_queue: "queue.Queue[int] | None" = None,
        max_workers: int = 4) -> pd.DataFrame:
    """Evaluate a dataframe in parallel (convenience function)."""
    parallel_eval = FreshEvalParallel(model='solar-pro2', max_workers=max_workers)
    return parallel_eval.evaluate_dataframe(df, mode, progress_queue, on_item_done)
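A minimal sketch of driving the convenience wrapper above with a progress queue, using a one-row stand-in frame (running it still requires the evaluation prompts and an Upstage API key to be configured; the reporter thread is illustrative, not the handler's actual UI code):

```python
import queue
import threading

import pandas as pd

from freshqa.fresheval_parallel import evaluate_dataframe_parallel

# One-row stand-in with the columns evaluate_single_row expects (values hypothetical).
merged_df = pd.DataFrame([{
    "question": "example question",
    "model_response": "example answer",
    **{f"answer_{i}": None for i in range(10)},
}])

progress = queue.Queue()

def drain_progress(total: int) -> None:
    # Workers put 1 into the queue for every finished row.
    done = 0
    while done < total:
        progress.get()
        done += 1
        print(f"progress: {done}/{total}")

reporter = threading.Thread(target=drain_progress, args=(len(merged_df),), daemon=True)
reporter.start()

relaxed_df = evaluate_dataframe_parallel(merged_df, mode="Relaxed",
                                         progress_queue=progress, max_workers=2)
print(relaxed_df[["rating"]])
```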
freshqa/freshqa_acc.py
ADDED
|
@@ -0,0 +1,361 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
FreshQA μ νλ κ³μ° μ€ν¬λ¦½νΈ
|
| 4 |
+
|
| 5 |
+
μ΄ μ€ν¬λ¦½νΈλ FreshQA λ°μ΄ν°μ
μ μ νλλ₯Ό κ³μ°νκ³ λ€μν μΉ΄ν
κ³ λ¦¬λ³λ‘ λΆμν©λλ€.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
import pandas as pd
|
| 9 |
+
import sys
|
| 10 |
+
import os
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
def load_freshqa_data(csv_path='freshqa.csv'):
|
| 14 |
+
"""FreshQA CSV νμΌμ λ‘λν©λλ€."""
|
| 15 |
+
try:
|
| 16 |
+
# λ¨Όμ νμΌμ μ½μ΄μ ꡬ쑰λ₯Ό νμΈ
|
| 17 |
+
temp_df = pd.read_csv(csv_path)
|
| 18 |
+
# print(f"νμΌ κ΅¬μ‘° νμΈ: {len(temp_df)}κ° ν, 컬λΌ: {temp_df.columns.tolist()}")
|
| 19 |
+
|
| 20 |
+
# rating 컬λΌμ΄ μμΌλ©΄ κ·Έλλ‘ μ¬μ©, μμΌλ©΄ skiprows μ μ©
|
| 21 |
+
if 'rating' in temp_df.columns:
|
| 22 |
+
fresh_qa = temp_df
|
| 23 |
+
# print("rating 컬λΌμ΄ μλ νμΌλ‘ μΈμνμ¬ μ 체 λ°μ΄ν°λ₯Ό μ¬μ©ν©λλ€.")
|
| 24 |
+
else:
|
| 25 |
+
fresh_qa = pd.read_csv(csv_path, skiprows=[0, 1])
|
| 26 |
+
# print("κΈ°λ³Έ FreshQA νμμΌλ‘ μΈμνμ¬ skiprowsλ₯Ό μ μ©ν©λλ€.")
|
| 27 |
+
|
| 28 |
+
# print(f"FreshQA λ°μ΄ν° λ‘λ μλ£: {len(fresh_qa)}κ° μν")
|
| 29 |
+
return fresh_qa
|
| 30 |
+
except FileNotFoundError:
|
| 31 |
+
print(f"μ€λ₯: {csv_path} νμΌμ μ°Ύμ μ μμ΅λλ€.")
|
| 32 |
+
print("νμ¬ λλ ν 리μ freshqa.csv νμΌμ΄ μλμ§ νμΈν΄μ£ΌμΈμ.")
|
| 33 |
+
sys.exit(1)
|
| 34 |
+
except Exception as e:
|
| 35 |
+
print(f"λ°μ΄ν° λ‘λ μ€ μ€λ₯ λ°μ: {e}")
|
| 36 |
+
sys.exit(1)
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
def process_freshqa_dataframe(df):
|
| 40 |
+
"""DataFrameμ λ§€κ°λ³μλ‘ λ°μ FreshQA λ°μ΄ν°λ₯Ό μ²λ¦¬ν©λλ€."""
|
| 41 |
+
try:
|
| 42 |
+
# print(f"DataFrame ꡬ쑰 νμΈ: {len(df)}κ° ν, 컬λΌ: {df.columns.tolist()}")
|
| 43 |
+
|
| 44 |
+
# rating 컬λΌμ΄ μμΌλ©΄ κ·Έλλ‘ μ¬μ©, μμΌλ©΄ κΈ°λ³Έκ° μ€μ
|
| 45 |
+
if 'rating' in df.columns:
|
| 46 |
+
# print("DataFrameμ rating 컬λΌμ΄ μμ΄μ κ·Έλλ‘ μ¬μ©ν©λλ€.")
|
| 47 |
+
processed_df = df.copy()
|
| 48 |
+
else:
|
| 49 |
+
# print("DataFrameμ rating 컬λΌμ΄ μμ΄μ κΈ°λ³Έκ° 0μΌλ‘ μ€μ ν©λλ€.")
|
| 50 |
+
processed_df = df.copy()
|
| 51 |
+
processed_df['rating'] = 0 # κΈ°λ³Έκ°μΌλ‘ 0 μ€μ
|
| 52 |
+
|
| 53 |
+
print(f"FreshQA λ°μ΄ν° μ²λ¦¬ μλ£: {len(processed_df)}κ° μν")
|
| 54 |
+
return processed_df
|
| 55 |
+
except Exception as e:
|
| 56 |
+
print(f"λ°μ΄ν° μ²λ¦¬ μ€ μ€λ₯ λ°μ: {e}")
|
| 57 |
+
raise
|
| 58 |
+
|
| 59 |
+
|
| 60 |
+
def update_results(df, d_acc, d_count, field_name):
|
| 61 |
+
"""κ²°κ³Ό λμ
λ리λ₯Ό μ
λ°μ΄νΈν©λλ€."""
|
| 62 |
+
if len(df) == 0:
|
| 63 |
+
r = 0.0
|
| 64 |
+
else:
|
| 65 |
+
# ratingμ΄ λ¬Έμμ΄ 'TRUE'μ΄κ±°λ μ«μ 1μΈ κ²½μ°λ₯Ό λͺ¨λ μ²λ¦¬
|
| 66 |
+
if df['rating'].dtype == 'object':
|
| 67 |
+
# λ¬Έμμ΄μΈ κ²½μ° 'TRUE' νμΈ
|
| 68 |
+
r = len(df[df.rating == 'TRUE']) * 100 / len(df)
|
| 69 |
+
else:
|
| 70 |
+
# μ«μμΈ κ²½μ° 1 νμΈ
|
| 71 |
+
r = len(df[df.rating == 1]) * 100 / len(df)
|
| 72 |
+
d_acc[field_name] = r
|
| 73 |
+
d_count[field_name] = len(df)
|
| 74 |
+
|
| 75 |
+
|
| 76 |
+
def calculate_accuracy_simple(fresh_qa):
|
| 77 |
+
"""FreshQA λ°μ΄ν°μ κΈ°λ³Έ μ νλλ₯Ό κ³μ°ν©λλ€ (κ°λ¨ν λ²μ )."""
|
| 78 |
+
print("μ νλ κ³μ° μ€...")
|
| 79 |
+
|
| 80 |
+
# rating 컬λΌμ΄ μμΌλ©΄ κΈ°λ³Έκ° 0μΌλ‘ μ€μ
|
| 81 |
+
if 'rating' not in fresh_qa.columns:
|
| 82 |
+
# print("rating 컬λΌμ΄ μμ΄μ κΈ°λ³Έκ° 0μΌλ‘ μ€μ ν©λλ€.")
|
| 83 |
+
fresh_qa['rating'] = 0
|
| 84 |
+
|
| 85 |
+
accs = {}
|
| 86 |
+
counts = {}
|
| 87 |
+
|
| 88 |
+
# μ 체 μ νλ
|
| 89 |
+
update_results(fresh_qa, accs, counts, 'overall_accuracy')
|
| 90 |
+
|
| 91 |
+
# split 컬λΌμ΄ μμΌλ©΄ λΆν λ³ μ νλ κ³μ°
|
| 92 |
+
if 'split' in fresh_qa.columns:
|
| 93 |
+
fresh_qa_test = fresh_qa[fresh_qa.split == 'TEST']
|
| 94 |
+
fresh_qa_dev = fresh_qa[fresh_qa.split == 'DEV']
|
| 95 |
+
update_results(fresh_qa_test, accs, counts, 'acc_test')
|
| 96 |
+
update_results(fresh_qa_dev, accs, counts, 'acc_dev')
|
| 97 |
+
|
| 98 |
+
# fact_type 컬λΌμ΄ μμΌλ©΄ μ¬μ€ μ νλ³ μ νλ κ³μ°
|
| 99 |
+
if 'fact_type' in fresh_qa.columns:
|
| 100 |
+
for fact_type in ['fast-changing', 'slow-changing', 'never-changing']:
|
| 101 |
+
if fact_type in fresh_qa['fact_type'].values:
|
| 102 |
+
sub_df = fresh_qa[fresh_qa.fact_type == fact_type]
|
| 103 |
+
update_results(sub_df, accs, counts, f'{fact_type}_accuracy')
|
| 104 |
+
|
| 105 |
+
# false_premise 컬λΌμ΄ μμΌλ©΄ False premise μ νλ κ³μ°
|
| 106 |
+
if 'false_premise' in fresh_qa.columns:
|
| 107 |
+
fp_df = fresh_qa[fresh_qa.false_premise == True]
|
| 108 |
+
if len(fp_df) > 0:
|
| 109 |
+
update_results(fp_df, accs, counts, 'false_premise_accuracy')
|
| 110 |
+
|
| 111 |
+
# domain 컬λΌμ΄ μμΌλ©΄ λλ©μΈλ³ μ νλ κ³μ°
|
| 112 |
+
if 'domain' in fresh_qa.columns:
|
| 113 |
+
# νκ΅μ΄ λλ©μΈ μΉ΄ν
κ³ λ¦¬λ€ (μ€μ CSV νμΌμ domain κ°λ€)
|
| 114 |
+
korean_domains = ['μ μΉ', 'μ€ν¬μΈ ', 'μ°μ', 'λ μ¨', 'μΈκ³', 'κ²½μ ', 'μ¬ν', 'IT/κ³Όν', 'μν/λ¬Έν', 'UNK']
|
| 115 |
+
|
| 116 |
+
for domain in korean_domains:
|
| 117 |
+
if domain in fresh_qa['domain'].values:
|
| 118 |
+
domain_df = fresh_qa[fresh_qa.domain == domain]
|
| 119 |
+
domain_test = domain_df[domain_df.split == 'TEST']
|
| 120 |
+
domain_dev = domain_df[domain_df.split == 'DEV']
|
| 121 |
+
|
| 122 |
+
# λλ©μΈλͺ μ μμ΄λ‘ λ³ν (νμΌλͺ /ν€μ μ¬μ©)
|
| 123 |
+
domain_key = domain.replace('/', '_').replace(' ', '_').lower()
|
| 124 |
+
if domain == 'IT/κ³Όν':
|
| 125 |
+
domain_key = 'it_science'
|
| 126 |
+
elif domain == 'μν/λ¬Έν':
|
| 127 |
+
domain_key = 'life_culture'
|
| 128 |
+
elif domain == 'UNK':
|
| 129 |
+
domain_key = 'unknown'
|
| 130 |
+
|
| 131 |
+
update_results(domain_df, accs, counts, f'acc_{domain_key}')
|
| 132 |
+
update_results(domain_test, accs, counts, f'acc_test_{domain_key}')
|
| 133 |
+
update_results(domain_dev, accs, counts, f'acc_dev_{domain_key}')
|
| 134 |
+
|
| 135 |
+
# κΈ°μ‘΄ μμ΄ λλ©μΈλ€λ μ μ§ (νΈνμ±μ μν΄)
|
| 136 |
+
english_domains = ['politics', 'sports', 'entertainment', 'weather', 'world', 'economy', 'society', 'it_science', 'life_culture']
|
| 137 |
+
for domain in english_domains:
|
| 138 |
+
if domain in fresh_qa['domain'].values:
|
| 139 |
+
domain_df = fresh_qa[fresh_qa.domain == domain]
|
| 140 |
+
update_results(domain_df, accs, counts, f'{domain}_accuracy')
|
| 141 |
+
|
| 142 |
+
# μ΄ μ§λ¬Έ μ μΆκ°
|
| 143 |
+
accs['total_questions'] = len(fresh_qa)
|
| 144 |
+
|
| 145 |
+
return accs
|
| 146 |
+
|
| 147 |
+
def calculate_accuracy(fresh_qa):
|
| 148 |
+
"""FreshQA λ°μ΄ν°μ μ νλλ₯Ό κ³μ°ν©λλ€."""
|
| 149 |
+
|
| 150 |
+
# λ°μ΄ν° λΆν
|
| 151 |
+
fresh_qa_test = fresh_qa[fresh_qa.split == 'TEST']
|
| 152 |
+
fresh_qa_dev = fresh_qa[fresh_qa.split == 'DEV']
|
| 153 |
+
|
| 154 |
+
accs = {}
|
| 155 |
+
counts = {}
|
| 156 |
+
|
| 157 |
+
# μ 체 μ νλ
|
| 158 |
+
update_results(fresh_qa, accs, counts, 'acc')
|
| 159 |
+
update_results(fresh_qa_test, accs, counts, 'acc_test')
|
| 160 |
+
update_results(fresh_qa_dev, accs, counts, 'acc_dev')
|
| 161 |
+
|
| 162 |
+
# μ¬μ€ μ νλ³ μ νλ
|
| 163 |
+
for fact_type in ['fast-changing', 'slow-changing', 'never-changing']:
|
| 164 |
+
sub_df = fresh_qa[(fresh_qa.false_premise == False) & (fresh_qa.fact_type == fact_type)]
|
| 165 |
+
sub_df_test = sub_df[sub_df.split == 'TEST']
|
| 166 |
+
sub_df_dev = sub_df[sub_df.split == 'DEV']
|
| 167 |
+
|
| 168 |
+
ft = fact_type.replace('-', '_')
|
| 169 |
+
update_results(sub_df, accs, counts, f'acc_{ft}')
|
| 170 |
+
update_results(sub_df_test, accs, counts, f'acc_test_{ft}')
|
| 171 |
+
update_results(sub_df_dev, accs, counts, f'acc_dev_{ft}')
|
| 172 |
+
|
| 173 |
+
# μ§λ¬Έ μ νλ³ μ νλ (vp: valid premise, fp: false premise)
|
| 174 |
+
for qt in ['vp', 'fp']:
|
| 175 |
+
fp = True if qt == 'fp' else False
|
| 176 |
+
data = fresh_qa[(fresh_qa.false_premise == fp)]
|
| 177 |
+
data_test = data[data.split == 'TEST']
|
| 178 |
+
data_dev = data[data.split == 'DEV']
|
| 179 |
+
|
| 180 |
+
# ν μλ³ λΆμ
|
| 181 |
+
data_one_hop = data[data.num_hops == 'one-hop']
|
| 182 |
+
data_one_hop_test = data_one_hop[data_one_hop.split == 'TEST']
|
| 183 |
+
data_one_hop_dev = data_one_hop[data_one_hop.split == 'DEV']
|
| 184 |
+
|
| 185 |
+
data_two_hop = data[data.num_hops == 'multi-hop']
|
| 186 |
+
data_two_hop_test = data_two_hop[data_two_hop.split == 'TEST']
|
| 187 |
+
data_two_hop_dev = data_two_hop[data_two_hop.split == 'DEV']
|
| 188 |
+
|
| 189 |
+
# μ°λλ³ λΆμ
|
| 190 |
+
data_old = data[(data.effective_year != '2022') & (data.effective_year != '2023')]
|
| 191 |
+
data_old_test = data_old[data_old.split == 'TEST']
|
| 192 |
+
data_old_dev = data_old[data_old.split == 'DEV']
|
| 193 |
+
|
| 194 |
+
data_new = data[(data.effective_year == '2022') | (data.effective_year == '2023')]
|
| 195 |
+
data_new_test = data_new[data_new.split == 'TEST']
|
| 196 |
+
data_new_dev = data_new[data_new.split == 'DEV']
|
| 197 |
+
|
| 198 |
+
# κΈ°λ³Έ μ νλ
|
| 199 |
+
update_results(data, accs, counts, f'acc_{qt}')
|
| 200 |
+
update_results(data_test, accs, counts, f'acc_test_{qt}')
|
| 201 |
+
update_results(data_dev, accs, counts, f'acc_dev_{qt}')
|
| 202 |
+
|
| 203 |
+
# ν μλ³ μ νλ
|
| 204 |
+
update_results(data_one_hop, accs, counts, f'acc_{qt}_one_hop')
|
| 205 |
+
update_results(data_one_hop_test, accs, counts, f'acc_test_{qt}_one_hop')
|
| 206 |
+
update_results(data_one_hop_dev, accs, counts, f'acc_dev_{qt}_one_hop')
|
| 207 |
+
|
| 208 |
+
update_results(data_two_hop, accs, counts, f'acc_{qt}_two_hop')
|
| 209 |
+
update_results(data_two_hop_test, accs, counts, f'acc_test_{qt}_two_hop')
|
| 210 |
+
update_results(data_two_hop_dev, accs, counts, f'acc_dev_{qt}_two_hop')
|
| 211 |
+
|
| 212 |
+
# μ°λλ³ μ νλ
|
| 213 |
+
update_results(data_old, accs, counts, f'acc_{qt}_old')
|
| 214 |
+
update_results(data_old_test, accs, counts, f'acc_test_{qt}_old')
|
| 215 |
+
update_results(data_old_dev, accs, counts, f'acc_dev_{qt}_old')
|
| 216 |
+
|
| 217 |
+
update_results(data_new, accs, counts, f'acc_{qt}_new')
|
| 218 |
+
update_results(data_new_test, accs, counts, f'acc_test_{qt}_new')
|
| 219 |
+
update_results(data_new_dev, accs, counts, f'acc_dev_{qt}_new')
|
| 220 |
+
|
| 221 |
+
# λλ©μΈλ³ μ νλ κ³μ°
|
| 222 |
+
if 'domain' in fresh_qa.columns:
|
| 223 |
+
# νκ΅μ΄ λλ©μΈ μΉ΄ν κ³ λ¦¬λ€ (μ€μ  CSV νμΌμ domain κ°λ€)
|
| 224 |
+
korean_domains = ['μ μΉ', 'μ€ν¬μΈ ', 'μ°μ', 'λ μ¨', 'μΈκ³', 'κ²½μ ', 'μ¬ν', 'IT/κ³Όν', 'μν/λ¬Έν', 'UNK']
|
| 225 |
+
# λλ©μΈλͺ μ μμ΄λ‘ λ³ν (νμΌλͺ /ν€μ μ¬μ©)
|
| 226 |
+
domain_mapping = {
|
| 227 |
+
'μ μΉ': 'politics',
|
| 228 |
+
'μ€ν¬μΈ ': 'sports',
|
| 229 |
+
'μ°μ': 'entertainment',
|
| 230 |
+
'λ μ¨': 'weather',
|
| 231 |
+
'μΈκ³': 'world',
|
| 232 |
+
'κ²½μ ': 'economy',
|
| 233 |
+
'μ¬ν': 'society',
|
| 234 |
+
'IT/κ³Όν': 'it_science',
|
| 235 |
+
'μν/λ¬Έν': 'life_culture',
|
| 236 |
+
'UNK': 'unknown'
|
| 237 |
+
}
|
| 238 |
+
for domain in korean_domains:
|
| 239 |
+
if domain in fresh_qa['domain'].values:
|
| 240 |
+
|
| 241 |
+
domain_df = fresh_qa[fresh_qa.domain == domain]
|
| 242 |
+
domain_test = domain_df[domain_df.split == 'TEST']
|
| 243 |
+
domain_dev = domain_df[domain_df.split == 'DEV']
|
| 244 |
+
|
| 245 |
+
domain_key = domain_mapping.get(domain, domain.replace('/', '_').replace(' ', '_').lower())
|
| 246 |
+
|
| 247 |
+
update_results(domain_df, accs, counts, f'acc_{domain_key}')
|
| 248 |
+
update_results(domain_test, accs, counts, f'acc_test_{domain_key}')
|
| 249 |
+
update_results(domain_dev, accs, counts, f'acc_dev_{domain_key}')
|
| 250 |
+
|
| 251 |
+
return accs, counts
|
| 252 |
+
|
| 253 |
+
|
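Before the pretty-printer below, it may help to see the key layout calculate_accuracy produces; the following is an illustrative (not exhaustive) sketch of how its output is typically consumed:

accs, counts = calculate_accuracy(fresh_qa)   # fresh_qa: a rated DataFrame with the columns used above
# accs['acc'], accs['acc_test'], accs['acc_dev']              -> overall and per-split accuracy (%)
# accs['acc_fast_changing'], accs['acc_dev_never_changing']   -> per fact type (hyphens become underscores)
# accs['acc_vp'], accs['acc_fp_one_hop'], accs['acc_vp_old']  -> premise type, hop count, recency buckets
# accs['acc_politics'], accs['acc_test_it_science']           -> per-domain keys from domain_mapping
# counts[...] carries the sample size behind each percentage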
| 254 |
+
def print_results(accs, counts):
|
| 255 |
+
"""κ²°κ³Όλ₯Ό 보기 μ’κ² μΆλ ₯ν©λλ€."""
|
| 256 |
+
print("\n" + "="*80)
|
| 257 |
+
print("FreshQA μ νλ λΆμ κ²°κ³Ό")
|
| 258 |
+
print("="*80)
|
| 259 |
+
|
| 260 |
+
# μ 체 μ νλ
|
| 261 |
+
print(f"\nπ μ 체 μ νλ:")
|
| 262 |
+
print(f" μ 체: {accs['acc']}% ({counts['acc']}κ° μν)")
|
| 263 |
+
print(f" ν
μ€νΈ: {accs['acc_test']}% ({counts['acc_test']}κ° μν)")
|
| 264 |
+
print(f" κ°λ°: {accs['acc_dev']}% ({counts['acc_dev']}κ° μν)")
|
| 265 |
+
|
| 266 |
+
# μ¬μ€ μ νλ³ μ νλ
|
| 267 |
+
print(f"\nπ μ¬μ€ μ νλ³ μ νλ:")
|
| 268 |
+
fact_types = {
|
| 269 |
+
'fast_changing': 'λΉ λ₯΄κ² λ³νλ μ¬μ€',
|
| 270 |
+
'slow_changing': 'μ²μ²ν λ³νλ μ¬μ€',
|
| 271 |
+
'never_changing': 'λ³νμ§ μλ μ¬μ€'
|
| 272 |
+
}
|
| 273 |
+
|
| 274 |
+
for key, name in fact_types.items():
|
| 275 |
+
print(f" {name}:")
|
| 276 |
+
print(f" μ 체: {accs[f'acc_{key}']}% ({counts[f'acc_{key}']}κ° μν)")
|
| 277 |
+
print(f" ν
μ€νΈ: {accs[f'acc_test_{key}']}% ({counts[f'acc_test_{key}']}κ° μν)")
|
| 278 |
+
print(f" κ°λ°: {accs[f'acc_dev_{key}']}% ({counts[f'acc_dev_{key}']}κ° μν)")
|
| 279 |
+
|
| 280 |
+
# μ§λ¬Έ μ νλ³ μ νλ
|
| 281 |
+
print(f"\nβ μ§λ¬Έ μ νλ³ μ νλ:")
|
| 282 |
+
question_types = {
|
| 283 |
+
'vp': 'μ ν¨ν μ μ (Valid Premise)',
|
| 284 |
+
'fp': 'μλͺ»λ μ μ (False Premise)'
|
| 285 |
+
}
|
| 286 |
+
|
| 287 |
+
for key, name in question_types.items():
|
| 288 |
+
print(f" {name}:")
|
| 289 |
+
print(f" μ 체: {accs[f'acc_{key}']}% ({counts[f'acc_{key}']}κ° μν)")
|
| 290 |
+
print(f" ν
μ€νΈ: {accs[f'acc_test_{key}']}% ({counts[f'acc_test_{key}']}κ° μν)")
|
| 291 |
+
print(f" κ°λ°: {accs[f'acc_dev_{key}']}% ({counts[f'acc_dev_{key}']}κ° μν)")
|
| 292 |
+
|
| 293 |
+
# ν μλ³
|
| 294 |
+
print(f" λ¨μΌ ν: {accs[f'acc_{key}_one_hop']}% ({counts[f'acc_{key}_one_hop']}κ° μν)")
|
| 295 |
+
print(f" λ€μ€ ν: {accs[f'acc_{key}_two_hop']}% ({counts[f'acc_{key}_two_hop']}κ° μν)")
|
| 296 |
+
|
| 297 |
+
# μ°λλ³
|
| 298 |
+
print(f" μ€λλ λ°μ΄ν°: {accs[f'acc_{key}_old']}% ({counts[f'acc_{key}_old']}κ° μν)")
|
| 299 |
+
print(f" μ΅μ λ°μ΄ν°: {accs[f'acc_{key}_new']}% ({counts[f'acc_{key}_new']}κ° μν)")
|
| 300 |
+
|
| 301 |
+
# λλ©μΈλ³ μ νλ
|
| 302 |
+
print(f"\nπ λλ©μΈλ³ μ νλ:")
|
| 303 |
+
domain_mapping = {
|
| 304 |
+
'politics': 'μ μΉ',
|
| 305 |
+
'sports': 'μ€ν¬μΈ ',
|
| 306 |
+
'entertainment': 'μ°μ',
|
| 307 |
+
'weather': 'λ μ¨',
|
| 308 |
+
'world': 'μΈκ³',
|
| 309 |
+
'economy': 'κ²½μ ',
|
| 310 |
+
'society': 'μ¬ν',
|
| 311 |
+
'it_science': 'IT/κ³Όν',
|
| 312 |
+
'life_culture': 'μν/λ¬Έν',
|
| 313 |
+
'unknown': 'UNK'
|
| 314 |
+
}
|
| 315 |
+
|
| 316 |
+
for key, name in domain_mapping.items():
|
| 317 |
+
if f'acc_{key}' in accs:
|
| 318 |
+
print(f" {name}:")
|
| 319 |
+
print(f" μ 체: {accs[f'acc_{key}']}% ({counts[f'acc_{key}']}κ° μν)")
|
| 320 |
+
if f'acc_test_{key}' in accs:
|
| 321 |
+
print(f" ν
μ€νΈ: {accs[f'acc_test_{key}']}% ({counts[f'acc_test_{key}']}κ° μν)")
|
| 322 |
+
pass
|
| 323 |
+
if f'acc_dev_{key}' in accs:
|
| 324 |
+
print(f" κ°λ°: {accs[f'acc_dev_{key}']}% ({counts[f'acc_dev_{key}']}κ° μν)")
|
| 325 |
+
pass
|
| 326 |
+
pass
|
| 327 |
+
|
| 328 |
+
print("\n" + "="*80)
|
| 329 |
+
|
| 330 |
+
|
| 331 |
+
def main():
|
| 332 |
+
"""λ©μΈ ν¨μ"""
|
| 333 |
+
print("FreshQA μ νλ κ³μ° μ€ν¬λ¦½νΈ")
|
| 334 |
+
print("="*50)
|
| 335 |
+
|
| 336 |
+
# CSV νμΌ κ²½λ‘ νμΈ
|
| 337 |
+
csv_path = 'freshqa.csv'
|
| 338 |
+
if len(sys.argv) > 1:
|
| 339 |
+
csv_path = sys.argv[1]
|
| 340 |
+
|
| 341 |
+
if not os.path.exists(csv_path):
|
| 342 |
+
print(f"μ€λ₯: {csv_path} νμΌμ μ°Ύμ μ μμ΅λλ€.")
|
| 343 |
+
print("μ¬μ©λ²: python freshqa_acc.py [csv_file_path]")
|
| 344 |
+
sys.exit(1)
|
| 345 |
+
|
| 346 |
+
# λ°μ΄ν° λ‘λ
|
| 347 |
+
fresh_qa = load_freshqa_data(csv_path)
|
| 348 |
+
|
| 349 |
+
# μ νλ κ³μ°
|
| 350 |
+
accs, counts = calculate_accuracy(fresh_qa)
|
| 351 |
+
|
| 352 |
+
# κ²°κ³Ό μΆλ ₯
|
| 353 |
+
print_results(accs, counts)
|
| 354 |
+
|
| 355 |
+
# λμ λ리 ννλ‘λ μΆλ ₯ (μλ³Έ λ ΈνΈλΆκ³Ό λμΌ)
|
| 356 |
+
print(f"\nπ λμ
λ리 νν κ²°κ³Ό:")
|
| 357 |
+
print(accs)
|
| 358 |
+
|
| 359 |
+
|
| 360 |
+
if __name__ == "__main__":
|
| 361 |
+
main()
|
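A minimal way to exercise the script above end to end, assuming a rated CSV with the columns calculate_accuracy reads (rating, split, fact_type, false_premise, num_hops, effective_year, domain); the file path is a placeholder:

# Command line, as in the usage message above:
#   python freshqa_acc.py path/to/rated_freshqa.csv
# Programmatic use of the same functions:
import pandas as pd
from freshqa.freshqa_acc import calculate_accuracy, print_results

rated = pd.read_csv("path/to/rated_freshqa.csv")  # placeholder path
accs, counts = calculate_accuracy(rated)
print_results(accs, counts)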
freshqa/merge_csv_with_model_response.py
ADDED
|
@@ -0,0 +1,187 @@
| 1 |
+
import pandas as pd
|
| 2 |
+
import argparse
|
| 3 |
+
import os
|
| 4 |
+
import sys
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
def merge_dataframe_with_model_response_df(
|
| 8 |
+
base_df: pd.DataFrame,
|
| 9 |
+
model_response_csv_path: str,
|
| 10 |
+
question_column: str = "question",
|
| 11 |
+
model_response_column: str = "model_response"
|
| 12 |
+
) -> pd.DataFrame:
|
| 13 |
+
"""
|
| 14 |
+
κΈ°μ€ DataFrameκ³Ό λͺ¨λΈ μλ΅ CSV νμΌμ questionμ κΈ°μ€μΌλ‘ λ§€μΉνμ¬
|
| 15 |
+
model_response 컬λΌμ μΆκ°ν DataFrameμ λ°νν©λλ€.
|
| 16 |
+
|
| 17 |
+
Args:
|
| 18 |
+
base_df (pd.DataFrame): κΈ°μ€μ΄ λλ DataFrame
|
| 19 |
+
model_response_csv_path (str): model_responseκ° ν¬ν¨λ CSV νμΌ κ²½λ‘
|
| 20 |
+
question_column (str): λ§€μΉμ μ¬μ©ν  μ§λ¬Έ 컬λΌλͺ  (κΈ°λ³Έκ°: "question")
|
| 21 |
+
model_response_column (str): λͺ¨λΈ μλ΅ μ»¬λΌλͺ  (κΈ°λ³Έκ°: "model_response")
|
| 22 |
+
|
| 23 |
+
Returns:
|
| 24 |
+
pd.DataFrame: λ³ν©λ DataFrame
|
| 25 |
+
"""
|
| 26 |
+
|
| 27 |
+
# DataFrame μ ν¨μ± κ²μ¬
|
| 28 |
+
if base_df is None or base_df.empty:
|
| 29 |
+
raise ValueError("κΈ°μ€ DataFrameμ΄ λΉμ΄μμ΅λλ€.")
|
| 30 |
+
|
| 31 |
+
# νμΌ μ‘΄μ¬ μ¬λΆ νμΈ
|
| 32 |
+
if not os.path.exists(model_response_csv_path):
|
| 33 |
+
raise FileNotFoundError(f"λͺ¨λΈ μλ΅ CSV νμΌμ μ°Ύμ μ μμ΅λλ€: {model_response_csv_path}")
|
| 34 |
+
|
| 35 |
+
try:
|
| 36 |
+
# λͺ¨λΈ μλ΅ CSV νμΌ μ½κΈ°
|
| 37 |
+
# print(f"λͺ¨λΈ μλ΅ CSV νμΌ μ½λ μ€: {model_response_csv_path}")
|
| 38 |
+
model_df = pd.read_csv(model_response_csv_path)
|
| 39 |
+
|
| 40 |
+
# μ»¬λΌ μ‘΄μ¬ μ¬λΆ νμΈ
|
| 41 |
+
if question_column not in base_df.columns:
|
| 42 |
+
raise ValueError(f"κΈ°μ€ DataFrameμ '{question_column}' 컬λΌμ΄ μμ΅λλ€.")
|
| 43 |
+
|
| 44 |
+
if question_column not in model_df.columns:
|
| 45 |
+
raise ValueError(f"λͺ¨λΈ μλ΅ CSV νμΌμ '{question_column}' 컬λΌμ΄ μμ΅λλ€.")
|
| 46 |
+
|
| 47 |
+
if model_response_column not in model_df.columns:
|
| 48 |
+
raise ValueError(f"λͺ¨λΈ μλ΅ CSV νμΌμ '{model_response_column}' 컬λΌμ΄ μμ΅λλ€.")
|
| 49 |
+
|
| 50 |
+
# print(f"κΈ°μ€ λ°μ΄ν°: {len(base_df)}ν")
|
| 51 |
+
# print(f"λͺ¨λΈ μλ΅ λ°μ΄ν°: {len(model_df)}ν")
|
| 52 |
+
|
| 53 |
+
# μ§λ¬Έ ν μ€νΈ μ κ·ν (곡백 μ κ±°, μλ¬Έμ λ³ν)
|
| 54 |
+
# print("μ§λ¬Έ ν μ€νΈ μ κ·ν μ€...")
|
| 55 |
+
base_df_normalized = base_df.copy()
|
| 56 |
+
model_df_normalized = model_df.copy()
|
| 57 |
+
|
| 58 |
+
# μ§λ¬Έ ν μ€νΈ μ κ·ν
|
| 59 |
+
base_df_normalized['question_normalized'] = base_df[question_column].str.strip().str.replace(r'\s+', ' ', regex=True)
|
| 60 |
+
model_df_normalized['question_normalized'] = model_df[question_column].str.strip().str.replace(r'\s+', ' ', regex=True)
|
| 61 |
+
|
| 62 |
+
# questionμ κΈ°μ€μΌλ‘ λ§€μΉ
|
| 63 |
+
# base_df(κΈ°μ€ λ°μ΄ν°)λ₯Ό κΈ°μ€μΌλ‘ model_df(μ¬μ©μ μ μΆ νμΌ)μ λ³ν©
|
| 64 |
+
# model_dfμμ νμν 컬λΌλ€λ§ μ ν
|
| 65 |
+
model_subset = model_df_normalized[[question_column, model_response_column, 'question_normalized']].copy()
|
| 66 |
+
|
| 67 |
+
# μ κ·νλ μ§λ¬ΈμΌλ‘ λ§€μΉ μλ
|
| 68 |
+
merged_df = base_df_normalized.merge(
|
| 69 |
+
model_subset,
|
| 70 |
+
left_on='question_normalized',
|
| 71 |
+
right_on='question_normalized',
|
| 72 |
+
how='left'
|
| 73 |
+
)
|
| 74 |
+
|
| 75 |
+
# split = DEVμΈ μ§λ¬Έμ μ μΈνκ³ TEST μ§λ¬Έλ§ λ¨κΉ
|
| 76 |
+
merged_df = merged_df[merged_df['split'] == 'TEST']
|
| 77 |
+
|
| 78 |
+
# μλ³Έ question μ»¬λΌ λ³΅μ (model_responseκ° μλ κ²½μ°)
|
| 79 |
+
if model_response_column in merged_df.columns:
|
| 80 |
+
# model_responseκ° μλ νλ€μ λν΄ μλ³Έ question μ»¬λΌ μ μ§
|
| 81 |
+
merged_df[question_column] = merged_df[question_column + '_x'].fillna(merged_df[question_column + '_y'])
|
| 82 |
+
# λΆνμν μ»¬λΌ μ κ±°
|
| 83 |
+
merged_df = merged_df.drop(columns=[question_column + '_x', question_column + '_y', 'question_normalized'], errors='ignore')
|
| 84 |
+
|
| 85 |
+
# merge ν question κΈ°μ€ μ€λ³΅ νμΈ λ° μ²λ¦¬
|
| 86 |
+
duplicate_mask = merged_df.duplicated(subset=[question_column], keep=False)
|
| 87 |
+
duplicate_count = duplicate_mask.sum()
|
| 88 |
+
if duplicate_count > 0:
|
| 89 |
+
# print(f"β οΈ κ²½κ³ : merge ν κΈ°μ€ λ°μ΄ν°μ μ€λ³΅ μ§λ¬Έμ΄ {duplicate_count}κ° λ°κ²¬λμμ΅λλ€.")
|
| 90 |
+
duplicate_questions = merged_df[duplicate_mask][question_column].unique()
|
| 91 |
+
# print(f" μ€λ³΅λ μ§λ¬Έ μ: {len(duplicate_questions)}κ°")
|
| 92 |
+
for i, q in enumerate(duplicate_questions):
|
| 93 |
+
dup_rows = merged_df[merged_df[question_column] == q]
|
| 94 |
+
# print(f" {i+1}. μ§λ¬Έ: '{q[:80]}...' ({len(dup_rows)}κ° μ€λ³΅)")
|
| 95 |
+
|
| 96 |
+
# μ€λ³΅ μ κ±°: 첫 λ²μ§Έ νλͺ©λ§ μ μ§
|
| 97 |
+
merged_df = merged_df.drop_duplicates(subset=[question_column], keep='first')
|
| 98 |
+
# print(f" β μ€λ³΅ μ κ±° ν merge λ°μ΄ν°: {len(merged_df)}ν")
|
| 99 |
+
|
| 100 |
+
# λ§€μΉ κ²°κ³Ό νμΈ (λͺ¨λΈ μλ΅μ΄ μλμ§ νμΈ)
|
| 101 |
+
matched_count = merged_df.dropna(subset=[model_response_column]).shape[0]
|
| 102 |
+
total_count = len(merged_df)
|
| 103 |
+
|
| 104 |
+
# print(f"λ§€μΉλ μ§λ¬Έ μ: {matched_count}/{total_count}")
|
| 105 |
+
|
| 106 |
+
if matched_count < total_count:
|
| 107 |
+
# λͺ¨λΈ μλ΅μ΄ μλ μ§λ¬Έλ€ μ°ΎκΈ°
|
| 108 |
+
unmatched_mask = merged_df[model_response_column].isna()
|
| 109 |
+
unmatched_questions = merged_df[unmatched_mask][question_column].tolist()
|
| 110 |
+
# print(f"λͺ¨λΈ μλ΅μ΄ μλ μ§λ¬Έλ€:")
|
| 111 |
+
for i, q in enumerate(unmatched_questions[:5]): # μ²μ 5κ°λ§ μΆλ ₯
|
| 112 |
+
# print(f" {i+1}. {q}")
|
| 113 |
+
pass
|
| 114 |
+
if len(unmatched_questions) > 5:
|
| 115 |
+
# print(f" ... λ° {len(unmatched_questions) - 5}κ° λ")
|
| 116 |
+
pass
|
| 117 |
+
|
| 118 |
+
# print(f"β
DataFrame μμ± μλ£!")
|
| 119 |
+
# print(f" - κΈ°μ€ DataFrame: {len(base_df)}ν")
|
| 120 |
+
# print(f" - λͺ¨λΈ μλ΅ νμΌ: {model_response_csv_path}")
|
| 121 |
+
# print(f" - λ§€μΉλ₯ : {matched_count/total_count*100:.1f}%")
|
| 122 |
+
|
| 123 |
+
return merged_df
|
| 124 |
+
|
| 125 |
+
except Exception as e:
|
| 126 |
+
print(f"β μ€λ₯ λ°μ: {str(e)}")
|
| 127 |
+
raise
|
| 128 |
+
|
| 129 |
+
def main():
|
| 130 |
+
"""
|
| 131 |
+
λͺ λ Ήν μΈμλ₯Ό λ°μμ CSV νμΌμ λ³ν©νλ λ©μΈ ν¨μ
|
| 132 |
+
"""
|
| 133 |
+
parser = argparse.ArgumentParser(
|
| 134 |
+
description="κΈ°μ€ CSV νμΌκ³Ό λͺ¨λΈ μλ΅ CSV νμΌμ λ³ν©ν©λλ€.",
|
| 135 |
+
formatter_class=argparse.RawDescriptionHelpFormatter,
|
| 136 |
+
epilog="""
|
| 137 |
+
μ¬μ© μμ:
|
| 138 |
+
python merge_csv_with_model_response.py base.csv model_response.csv output.csv
|
| 139 |
+
python merge_csv_with_model_response.py base.csv model_response.csv output.csv --question-col question --response-col model_response
|
| 140 |
+
"""
|
| 141 |
+
)
|
| 142 |
+
|
| 143 |
+
parser.add_argument(
|
| 144 |
+
'base_csv',
|
| 145 |
+
help='κΈ°μ€μ΄ λλ CSV νμΌ κ²½λ‘'
|
| 146 |
+
)
|
| 147 |
+
|
| 148 |
+
parser.add_argument(
|
| 149 |
+
'model_response_csv',
|
| 150 |
+
help='λͺ¨λΈ μλ΅μ΄ ν¬ν¨λ CSV νμΌ κ²½λ‘'
|
| 151 |
+
)
|
| 152 |
+
|
| 153 |
+
parser.add_argument(
|
| 154 |
+
'output_csv',
|
| 155 |
+
help='μΆλ ₯ν CSV νμΌ κ²½λ‘'
|
| 156 |
+
)
|
| 157 |
+
|
| 158 |
+
parser.add_argument(
|
| 159 |
+
'--question-col',
|
| 160 |
+
default='question',
|
| 161 |
+
help='λ§€μΉμ μ¬μ©ν  μ§λ¬Έ 컬λΌλͺ  (κΈ°λ³Έκ°: question)'
|
| 162 |
+
)
|
| 163 |
+
|
| 164 |
+
parser.add_argument(
|
| 165 |
+
'--response-col',
|
| 166 |
+
default='model_response',
|
| 167 |
+
help='λͺ¨λΈ μλ΅ μ»¬λΌλͺ  (κΈ°λ³Έκ°: model_response)'
|
| 168 |
+
)
|
| 169 |
+
|
| 170 |
+
# μΈμ νμ±
|
| 171 |
+
args = parser.parse_args()
|
| 172 |
+
|
| 173 |
+
try:
|
| 174 |
+
merged_df = merge_dataframe_with_model_response_df(
|
| 175 |
+
base_df=pd.read_csv(args.base_csv),
|
| 176 |
+
model_response_csv_path=args.model_response_csv,
|
| 177 |
+
question_column=args.question_col,
|
| 178 |
+
model_response_column=args.response_col
|
| 179 |
+
)
|
| 180 |
+
merged_df.to_csv(args.output_csv, index=False)
|
| 181 |
+
except Exception as e:
|
| 182 |
+
print(f"μ€ν μ€ μ€λ₯ λ°μ: {e}")
|
| 183 |
+
sys.exit(1)
|
| 184 |
+
|
| 185 |
+
|
| 186 |
+
if __name__ == "__main__":
|
| 187 |
+
main()
|
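Because the merge function takes an in-memory base DataFrame plus the path of a responses CSV, it can also be called outside the CLI. A hedged sketch follows; the file paths are placeholders, and the base frame must already carry the question and split columns the function filters on:

import pandas as pd
from freshqa.merge_csv_with_model_response import merge_dataframe_with_model_response_df

base_df = pd.read_csv("path/to/ko-freshqa_base.csv")        # placeholder: reference questions with split
merged = merge_dataframe_with_model_response_df(
    base_df=base_df,
    model_response_csv_path="path/to/model_responses.csv",  # placeholder: question + model_response rows
)
merged.to_csv("path/to/merged_for_eval.csv", index=False)   # only TEST rows remain after the merge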
requirements.txt
ADDED
|
@@ -0,0 +1,36 @@
| 1 |
+
# Core dependencies for Hugging Face Spaces
|
| 2 |
+
gradio>=5.0.0
|
| 3 |
+
huggingface_hub<1.0.0
|
| 4 |
+
pandas>=2.0.0
|
| 5 |
+
numpy>=1.24.0
|
| 6 |
+
plotly>=5.0.0
|
| 7 |
+
|
| 8 |
+
# API and web processing
|
| 9 |
+
requests>=2.25.0
|
| 10 |
+
httpx>=0.24.0
|
| 11 |
+
|
| 12 |
+
# Date and time handling
|
| 13 |
+
pytz>=2023.3
|
| 14 |
+
python-dateutil>=2.8.0
|
| 15 |
+
|
| 16 |
+
# Data processing
|
| 17 |
+
openpyxl>=3.0.0
|
| 18 |
+
chardet>=5.0.0
|
| 19 |
+
|
| 20 |
+
# Progress and logging
|
| 21 |
+
tqdm>=4.65.0
|
| 22 |
+
|
| 23 |
+
# FreshQA evaluation
|
| 24 |
+
openai>=1.10.0
|
| 25 |
+
tabulate>=0.9.0
|
| 26 |
+
|
| 27 |
+
# Environment variables
|
| 28 |
+
python-dotenv>=1.0.0
|
| 29 |
+
|
| 30 |
+
# Optional: Korean language processing (commented out for faster deployment)
|
| 31 |
+
# konlpy>=0.6.0
|
| 32 |
+
|
| 33 |
+
# Optional: Heavy ML dependencies (commented out for faster deployment)
|
| 34 |
+
# torch>=2.0.0
|
| 35 |
+
# transformers>=4.30.0
|
| 36 |
+
# accelerate>=0.20.0
|
src/api_key_rotator.py
ADDED
|
@@ -0,0 +1,78 @@
| 1 |
+
"""
|
| 2 |
+
API ν€ λ‘ν μ΄ν° λͺ¨λ
|
| 3 |
+
λ©ν°μ€λ λ© νκ²½μμ μ¬λ¬ API ν€λ₯Ό λΌμ΄λλ‘λΉ λ°©μμΌλ‘ λΆλ°°ν©λλ€.
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import threading
|
| 7 |
+
from itertools import count
|
| 8 |
+
from typing import List
|
| 9 |
+
|
| 10 |
+
from config import Config
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
class ApiKeyRotator:
|
| 14 |
+
"""API ν€λ₯Ό λΌμ΄λλ‘λΉ λ°©μμΌλ‘ λΆλ°°νλ ν΄λμ€ (Thread-safe)"""
|
| 15 |
+
|
| 16 |
+
def __init__(self, keys: List[str]):
|
| 17 |
+
"""
|
| 18 |
+
Args:
|
| 19 |
+
keys: μ¬μ©ν API ν€ λ¦¬μ€νΈ (μ΅μ 1κ° μ΄μ)
|
| 20 |
+
"""
|
| 21 |
+
if not keys:
|
| 22 |
+
raise ValueError("API ν€ λ¦¬μ€νΈκ° λΉμ΄μμ΅λλ€. μ΅μ 1κ°μ ν€κ° νμν©λλ€.")
|
| 23 |
+
|
| 24 |
+
self.keys = keys
|
| 25 |
+
self._counter = count() # 무ν μΉ΄μ΄ν°
|
| 26 |
+
self._lock = threading.Lock()
|
| 27 |
+
|
| 28 |
+
def pick_key(self) -> str:
|
| 29 |
+
"""
|
| 30 |
+
λΌμ΄λλ‘λΉ λ°©μμΌλ‘ λ€μ ν€λ₯Ό μ νν©λλ€ (Thread-safe)
|
| 31 |
+
|
| 32 |
+
Returns:
|
| 33 |
+
μ νλ API ν€
|
| 34 |
+
|
| 35 |
+
Example:
|
| 36 |
+
>>> rotator = ApiKeyRotator(["key1", "key2", "key3"])
|
| 37 |
+
>>> rotator.pick_key() # "key1"
|
| 38 |
+
>>> rotator.pick_key() # "key2"
|
| 39 |
+
>>> rotator.pick_key() # "key3"
|
| 40 |
+
>>> rotator.pick_key() # "key1" (μν)
|
| 41 |
+
"""
|
| 42 |
+
with self._lock:
|
| 43 |
+
index = next(self._counter) % len(self.keys)
|
| 44 |
+
return self.keys[index]
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
# μ μ μΈμ€ν΄μ€ (μ±κΈν€ ν¨ν΄)
|
| 48 |
+
_rotator_instance: ApiKeyRotator = None
|
| 49 |
+
_instance_lock = threading.Lock()
|
| 50 |
+
|
| 51 |
+
|
| 52 |
+
def get_rotator() -> ApiKeyRotator:
|
| 53 |
+
"""
|
| 54 |
+
μ μ ApiKeyRotator μΈμ€ν΄μ€λ₯Ό λ°νν©λλ€ (Lazy initialization)
|
| 55 |
+
|
| 56 |
+
Returns:
|
| 57 |
+
ApiKeyRotator μΈμ€ν΄μ€
|
| 58 |
+
|
| 59 |
+
Note:
|
| 60 |
+
- μ΅μ΄ νΈμΆ μ Config.UPSTAGE_API_KEYSλ₯Ό μ¬μ©νμ¬ μ΄κΈ°ν
|
| 61 |
+
- μ΄ν νΈμΆμ λμΌν μΈμ€ν΄μ€λ₯Ό λ°ν
|
| 62 |
+
"""
|
| 63 |
+
global _rotator_instance
|
| 64 |
+
|
| 65 |
+
# Double-checked locking ν¨ν΄
|
| 66 |
+
if _rotator_instance is None:
|
| 67 |
+
with _instance_lock:
|
| 68 |
+
if _rotator_instance is None:
|
| 69 |
+
keys = Config.UPSTAGE_API_KEYS
|
| 70 |
+
if not keys:
|
| 71 |
+
raise ValueError(
|
| 72 |
+
"UPSTAGE_API_KEY λλ UPSTAGE_API_KEYS νκ²½ λ³μκ° μ€μ λμ§ μμμ΅λλ€. "
|
| 73 |
+
"μ΅μ 1κ°μ API ν€κ° νμν©λλ€."
|
| 74 |
+
)
|
| 75 |
+
_rotator_instance = ApiKeyRotator(keys)
|
| 76 |
+
|
| 77 |
+
return _rotator_instance
|
| 78 |
+
|
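A short usage sketch for the rotator, e.g. from worker threads that call an external API; the thread count and the stub body are arbitrary, and get_rotator requires the UPSTAGE API key environment variables to be configured:

from concurrent.futures import ThreadPoolExecutor
from src.api_key_rotator import get_rotator

rotator = get_rotator()              # lazily built singleton over Config.UPSTAGE_API_KEYS

def call_api(question: str) -> str:
    api_key = rotator.pick_key()     # thread-safe round-robin selection
    # ... pass api_key to the actual evaluation client here ...
    return api_key

with ThreadPoolExecutor(max_workers=4) as pool:
    used_keys = list(pool.map(call_api, ["q1", "q2", "q3", "q4"]))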
src/hf_private_csv_loader.py
ADDED
|
@@ -0,0 +1,279 @@
| 1 |
+
"""
|
| 2 |
+
Hugging Face Private Repository CSV νμΌ λ‘λ
|
| 3 |
+
HF_TOKENμ μ΄μ©νμ¬ private repositoryμμ CSV νμΌμ μμ νκ² λΆλ¬μ€λ λͺ¨λ
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import os
|
| 7 |
+
import pandas as pd
|
| 8 |
+
import tempfile
|
| 9 |
+
from typing import Optional, Dict, Any, Union
|
| 10 |
+
from huggingface_hub import hf_hub_download, login, whoami
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
class HFPrivateCSVLoader:
|
| 14 |
+
"""Hugging Face Private Repositoryμμ CSV νμΌμ λ‘λνλ ν΄λμ€"""
|
| 15 |
+
|
| 16 |
+
def __init__(self, token: Optional[str] = None):
|
| 17 |
+
"""
|
| 18 |
+
Args:
|
| 19 |
+
token: Hugging Face API ν ν°. Noneμ΄λ©΄ νκ²½λ³μμμ μλμΌλ‘ κ°μ Έμ΄
|
| 20 |
+
"""
|
| 21 |
+
self.token = token or os.getenv('HF_TOKEN') or os.getenv('HUGGINGFACE_HUB_TOKEN')
|
| 22 |
+
|
| 23 |
+
if not self.token:
|
| 24 |
+
raise ValueError(
|
| 25 |
+
"Hugging Face ν ν°μ΄ νμν©λλ€. "
|
| 26 |
+
"ν ν°μ μ§μ μ λ¬νκ±°λ HF_TOKEN λλ HUGGINGFACE_HUB_TOKEN νκ²½λ³μλ₯Ό μ€μ νμΈμ."
|
| 27 |
+
)
|
| 28 |
+
|
| 29 |
+
# ν ν°μΌλ‘ λ‘κ·ΈμΈ
|
| 30 |
+
try:
|
| 31 |
+
login(token=self.token)
|
| 32 |
+
print("β
Hugging Faceμ μ±κ³΅μ μΌλ‘ λ‘κ·ΈμΈλμμ΅λλ€.")
|
| 33 |
+
except Exception as e:
|
| 34 |
+
print(f"β Hugging Face λ‘κ·ΈμΈ μ€ν¨: {e}")
|
| 35 |
+
raise
|
| 36 |
+
|
| 37 |
+
def check_auth(self) -> Dict[str, Any]:
|
| 38 |
+
"""νμ¬ μΈμ¦ μν νμΈ"""
|
| 39 |
+
try:
|
| 40 |
+
user_info = whoami()
|
| 41 |
+
return {
|
| 42 |
+
"authenticated": True,
|
| 43 |
+
"user": user_info.get("name", "Unknown"),
|
| 44 |
+
"type": user_info.get("type", "Unknown"),
|
| 45 |
+
"id": user_info.get("id", "Unknown")
|
| 46 |
+
}
|
| 47 |
+
except Exception as e:
|
| 48 |
+
return {
|
| 49 |
+
"authenticated": False,
|
| 50 |
+
"error": str(e)
|
| 51 |
+
}
|
| 52 |
+
|
| 53 |
+
def load_csv_from_private_repo(self,
|
| 54 |
+
repo_id: str,
|
| 55 |
+
filename: str,
|
| 56 |
+
repo_type: str = "dataset",
|
| 57 |
+
**kwargs) -> Optional[pd.DataFrame]:
|
| 58 |
+
"""
|
| 59 |
+
Private repositoryμμ CSV νμΌμ μ§μ λ‘λν©λλ€.
|
| 60 |
+
|
| 61 |
+
Args:
|
| 62 |
+
repo_id: Repository ID (μ: "username/repo-name")
|
| 63 |
+
filename: CSV νμΌλͺ  (κ²½λ‘ ν¬ν¨ κ°λ₯)
|
| 64 |
+
repo_type: Repository νμ  ("dataset", "model", "space")
|
| 65 |
+
**kwargs: pandas.read_csv()μ μ λ¬ν μΆκ° μΈμλ€
|
| 66 |
+
|
| 67 |
+
Returns:
|
| 68 |
+
pandas DataFrame λλ None (μ€ν¨μ)
|
| 69 |
+
"""
|
| 70 |
+
try:
|
| 71 |
+
print(f"π₯ Private repositoryμμ CSV νμΌ λ‘λ μμ: {repo_id}/{filename}")
|
| 72 |
+
|
| 73 |
+
# μμ λλ ν 리μ νμΌ λ€μ΄λ‘λ
|
| 74 |
+
with tempfile.TemporaryDirectory() as temp_dir:
|
| 75 |
+
file_path = hf_hub_download(
|
| 76 |
+
repo_id=repo_id,
|
| 77 |
+
filename=filename,
|
| 78 |
+
local_dir=temp_dir,
|
| 79 |
+
repo_type=repo_type,
|
| 80 |
+
token=self.token # ν ν° λͺ μμ  μ λ¬
|
| 81 |
+
)
|
| 82 |
+
|
| 83 |
+
# print(f"β
νμΌ λ€μ΄λ‘λ μλ£: {file_path}")
|
| 84 |
+
|
| 85 |
+
# CSV νμΌ λ‘λ (κΈ°λ³Έ μ€μ + μ¬μ©μ μ§μ μ€μ )
|
| 86 |
+
default_kwargs = {
|
| 87 |
+
'encoding': 'utf-8',
|
| 88 |
+
'low_memory': False
|
| 89 |
+
}
|
| 90 |
+
default_kwargs.update(kwargs)
|
| 91 |
+
|
| 92 |
+
df = pd.read_csv(file_path, **default_kwargs)
|
| 93 |
+
|
| 94 |
+
# print(f"β
CSV νμΌ λ‘λ μλ£: {filename} ({len(df)} ν, {len(df.columns)} μ΄)")
|
| 95 |
+
return df
|
| 96 |
+
|
| 97 |
+
except Exception as e:
|
| 98 |
+
print(f"β CSV νμΌ λ‘λ μ€ν¨: {e}")
|
| 99 |
+
return None
|
| 100 |
+
|
| 101 |
+
def load_multiple_csvs(self,
|
| 102 |
+
repo_id: str,
|
| 103 |
+
filenames: list,
|
| 104 |
+
repo_type: str = "dataset",
|
| 105 |
+
**kwargs) -> Dict[str, Optional[pd.DataFrame]]:
|
| 106 |
+
"""
|
| 107 |
+
μ¬λ¬ CSV νμΌμ ν λ²μ λ‘λν©λλ€.
|
| 108 |
+
|
| 109 |
+
Args:
|
| 110 |
+
repo_id: Repository ID
|
| 111 |
+
filenames: CSV νμΌλͺ  리μ€νΈ
|
| 112 |
+
repo_type: Repository νμ
|
| 113 |
+
**kwargs: pandas.read_csv()μ μ λ¬ν μΆκ° μΈμλ€
|
| 114 |
+
|
| 115 |
+
Returns:
|
| 116 |
+
{filename: DataFrame} λμ λ리
|
| 117 |
+
"""
|
| 118 |
+
results = {}
|
| 119 |
+
|
| 120 |
+
for filename in filenames:
|
| 121 |
+
# print(f"π₯ {filename} λ‘λ μ€...")
|
| 122 |
+
df = self.load_csv_from_private_repo(repo_id, filename, repo_type, **kwargs)
|
| 123 |
+
results[filename] = df
|
| 124 |
+
|
| 125 |
+
if df is not None:
|
| 126 |
+
# print(f"β
{filename} λ‘λ μ±κ³΅")
|
| 127 |
+
pass
|
| 128 |
+
else:
|
| 129 |
+
print(f"β οΈ {filename} λ‘λ μ€ν¨")
|
| 130 |
+
|
| 131 |
+
return results
|
| 132 |
+
|
| 133 |
+
def get_csv_info(self,
|
| 134 |
+
repo_id: str,
|
| 135 |
+
filename: str,
|
| 136 |
+
repo_type: str = "dataset") -> Optional[Dict[str, Any]]:
|
| 137 |
+
"""
|
| 138 |
+
CSV νμΌμ κΈ°λ³Έ μ 보λ₯Ό λ°νν©λλ€ (μ€μ λ‘λ μμ΄).
|
| 139 |
+
|
| 140 |
+
Args:
|
| 141 |
+
repo_id: Repository ID
|
| 142 |
+
filename: CSV νμΌλͺ
|
| 143 |
+
repo_type: Repository νμ
|
| 144 |
+
|
| 145 |
+
Returns:
|
| 146 |
+
νμΌ μ 보 λμ λ리 λλ None
|
| 147 |
+
"""
|
| 148 |
+
try:
|
| 149 |
+
# μμλ‘ νμΌμ λ‘λνμ¬ μ λ³΄λ§ νμΈ
|
| 150 |
+
df = self.load_csv_from_private_repo(repo_id, filename, repo_type)
|
| 151 |
+
|
| 152 |
+
if df is not None:
|
| 153 |
+
return {
|
| 154 |
+
"filename": filename,
|
| 155 |
+
"rows": len(df),
|
| 156 |
+
"columns": len(df.columns),
|
| 157 |
+
"column_names": df.columns.tolist(),
|
| 158 |
+
"dtypes": df.dtypes.to_dict(),
|
| 159 |
+
"memory_usage": df.memory_usage(deep=True).sum(),
|
| 160 |
+
"has_nulls": df.isnull().any().any(),
|
| 161 |
+
"null_counts": df.isnull().sum().to_dict()
|
| 162 |
+
}
|
| 163 |
+
else:
|
| 164 |
+
return None
|
| 165 |
+
|
| 166 |
+
except Exception as e:
|
| 167 |
+
print(f"β CSV νμΌ μ 보 μ‘°ν μ€ν¨: {e}")
|
| 168 |
+
return None
|
| 169 |
+
|
| 170 |
+
|
| 171 |
+
def load_csv_with_token(repo_id: str,
|
| 172 |
+
filename: str,
|
| 173 |
+
token: str,
|
| 174 |
+
repo_type: str = "dataset",
|
| 175 |
+
**kwargs) -> Optional[pd.DataFrame]:
|
| 176 |
+
"""
|
| 177 |
+
νΈμ ν¨μ: ν ν°μ μ§μ μ λ¬νμ¬ CSV νμΌμ λ‘λν©λλ€.
|
| 178 |
+
|
| 179 |
+
Args:
|
| 180 |
+
repo_id: Repository ID
|
| 181 |
+
filename: CSV νμΌλͺ
|
| 182 |
+
token: Hugging Face API ν ν°
|
| 183 |
+
repo_type: Repository νμ
|
| 184 |
+
**kwargs: pandas.read_csv()μ μ λ¬ν μΆκ° μΈμλ€
|
| 185 |
+
|
| 186 |
+
Returns:
|
| 187 |
+
pandas DataFrame λλ None
|
| 188 |
+
"""
|
| 189 |
+
try:
|
| 190 |
+
loader = HFPrivateCSVLoader(token=token)
|
| 191 |
+
return loader.load_csv_from_private_repo(repo_id, filename, repo_type, **kwargs)
|
| 192 |
+
except Exception as e:
|
| 193 |
+
print(f"β CSV λ‘λ μ€ν¨: {e}")
|
| 194 |
+
return None
|
| 195 |
+
|
| 196 |
+
|
| 197 |
+
def load_csv_with_env_token(repo_id: str,
|
| 198 |
+
filename: str,
|
| 199 |
+
repo_type: str = "dataset",
|
| 200 |
+
**kwargs) -> Optional[pd.DataFrame]:
|
| 201 |
+
"""
|
| 202 |
+
νΈμ ν¨μ: νκ²½λ³μμ ν ν°μ μ¬μ©νμ¬ CSV νμΌμ λ‘λν©λλ€.
|
| 203 |
+
|
| 204 |
+
Args:
|
| 205 |
+
repo_id: Repository ID
|
| 206 |
+
filename: CSV νμΌλͺ
|
| 207 |
+
repo_type: Repository νμ
|
| 208 |
+
**kwargs: pandas.read_csv()μ μ λ¬ν μΆκ° μΈμλ€
|
| 209 |
+
|
| 210 |
+
Returns:
|
| 211 |
+
pandas DataFrame λλ None
|
| 212 |
+
"""
|
| 213 |
+
try:
|
| 214 |
+
loader = HFPrivateCSVLoader() # νκ²½λ³μμμ ν ν° μλ λ‘λ
|
| 215 |
+
return loader.load_csv_from_private_repo(repo_id, filename, repo_type, **kwargs)
|
| 216 |
+
except Exception as e:
|
| 217 |
+
print(f"β CSV λ‘λ μ€ν¨: {e}")
|
| 218 |
+
return None
|
| 219 |
+
|
| 220 |
+
|
| 221 |
+
# μ¬μ© μμ
|
| 222 |
+
if __name__ == "__main__":
|
| 223 |
+
# μμ 1: ν ν°μ μ§μ μ λ¬
|
| 224 |
+
print("=== μμ 1: ν ν° μ§μ μ λ¬ ===")
|
| 225 |
+
token = "your_hf_token_here" # μ€μ ν ν°μΌλ‘ λ³κ²½
|
| 226 |
+
|
| 227 |
+
try:
|
| 228 |
+
df = load_csv_with_token(
|
| 229 |
+
repo_id="username/private-dataset",
|
| 230 |
+
filename="data.csv",
|
| 231 |
+
token=token,
|
| 232 |
+
repo_type="dataset"
|
| 233 |
+
)
|
| 234 |
+
|
| 235 |
+
if df is not None:
|
| 236 |
+
print(f"β
CSV λ‘λ μ±κ³΅: {len(df)} ν, {len(df.columns)} μ΄")
|
| 237 |
+
print(f"컬λΌ: {list(df.columns)}")
|
| 238 |
+
else:
|
| 239 |
+
print("β CSV λ‘λ μ€ν¨")
|
| 240 |
+
except Exception as e:
|
| 241 |
+
print(f"β μ€λ₯: {e}")
|
| 242 |
+
|
| 243 |
+
# μμ 2: νκ²½λ³μ μ¬μ©
|
| 244 |
+
print("\n=== μμ 2: νκ²½λ³μ μ¬μ© ===")
|
| 245 |
+
try:
|
| 246 |
+
df = load_csv_with_env_token(
|
| 247 |
+
repo_id="username/private-dataset",
|
| 248 |
+
filename="data.csv",
|
| 249 |
+
repo_type="dataset"
|
| 250 |
+
)
|
| 251 |
+
|
| 252 |
+
if df is not None:
|
| 253 |
+
print(f"β
CSV λ‘λ μ±κ³΅: {len(df)} ν, {len(df.columns)} μ΄")
|
| 254 |
+
else:
|
| 255 |
+
print("β CSV λ‘λ μ€ν¨")
|
| 256 |
+
except Exception as e:
|
| 257 |
+
print(f"β μ€λ₯: {e}")
|
| 258 |
+
|
| 259 |
+
# μμ 3: ν΄λμ€ μ¬μ©
|
| 260 |
+
print("\n=== μμ 3: ν΄λμ€ μ¬μ© ===")
|
| 261 |
+
try:
|
| 262 |
+
loader = HFPrivateCSVLoader(token=token)
|
| 263 |
+
|
| 264 |
+
# μΈμ¦ μν νμΈ
|
| 265 |
+
auth_status = loader.check_auth()
|
| 266 |
+
print(f"μΈμ¦ μν: {auth_status}")
|
| 267 |
+
|
| 268 |
+
# CSV νμΌ μ 보 νμΈ
|
| 269 |
+
csv_info = loader.get_csv_info("username/private-dataset", "data.csv")
|
| 270 |
+
if csv_info:
|
| 271 |
+
print(f"CSV νμΌ μ 보: {csv_info}")
|
| 272 |
+
|
| 273 |
+
# CSV νμΌ λ‘λ
|
| 274 |
+
df = loader.load_csv_from_private_repo("username/private-dataset", "data.csv")
|
| 275 |
+
if df is not None:
|
| 276 |
+
print(f"β
CSV λ‘λ μ±κ³΅: {len(df)} ν, {len(df.columns)} μ΄")
|
| 277 |
+
|
| 278 |
+
except Exception as e:
|
| 279 |
+
print(f"β μ€λ₯: {e}")
|
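One detail worth noting about load_csv_from_private_repo above: the download lands in a TemporaryDirectory, so the raw file is gone once the method returns and only the DataFrame survives. If the CSV itself needs to persist, e.g. for caching between runs, hf_hub_download can be called directly with a durable directory; repository and file names below are placeholders:

import os
from huggingface_hub import hf_hub_download

path = hf_hub_download(
    repo_id="your-org/your-private-dataset",  # placeholder
    filename="data.csv",                      # placeholder
    repo_type="dataset",
    local_dir="./hf_cache",                   # survives after the call, unlike the temp dir above
    token=os.getenv("HF_TOKEN"),              # needed for private repositories
)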
src/leaderboard_manager.py
ADDED
|
@@ -0,0 +1,215 @@
| 1 |
+
"""
|
| 2 |
+
리λ보λ κ΄λ¦¬ λͺ¨λ
|
| 3 |
+
리λ보λ λ°μ΄ν°μ λ‘λ, μ μ₯, νμ μ€λΉλ₯Ό λ΄λΉν©λλ€.
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import pandas as pd
|
| 7 |
+
import os
|
| 8 |
+
from src.utils import file_lock
|
| 9 |
+
|
| 10 |
+
def load_leaderboard_data():
|
| 11 |
+
"""리λ보λ λ°μ΄ν° λ‘λ"""
|
| 12 |
+
try:
|
| 13 |
+
# νλ‘μ νΈ λ£¨νΈμμ data λλ ν 리 μ°ΎκΈ°
|
| 14 |
+
current_dir = os.path.dirname(os.path.abspath(__file__)) # src/ ν΄λ
|
| 15 |
+
project_root = os.path.dirname(current_dir) # νλ‘μ νΈ λ£¨νΈ
|
| 16 |
+
data_path = os.path.join(project_root, 'data', 'leaderboard_results.csv')
|
| 17 |
+
df = pd.read_csv(data_path)
|
| 18 |
+
|
| 19 |
+
# κΈ°μ‘΄ λ°μ΄ν°μ evaluation_mode 컬λΌμ΄ μμΌλ©΄ μΆκ°
|
| 20 |
+
if 'evaluation_mode' not in df.columns:
|
| 21 |
+
df['evaluation_mode'] = 'Unknown'
|
| 22 |
+
|
| 23 |
+
text_columns = ['model', 'description']
|
| 24 |
+
for col in text_columns:
|
| 25 |
+
if col not in df.columns:
|
| 26 |
+
df[col] = pd.Series(dtype='object')
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
# μλ‘μ΄ μμΈ λΆμ 컬λΌλ€μ΄ μμΌλ©΄ μΆκ°
|
| 30 |
+
detailed_columns = [
|
| 31 |
+
'acc_test', 'acc_dev', 'acc_vp', 'acc_fp', 'acc_vp_one_hop', 'acc_vp_two_hop',
|
| 32 |
+
'acc_fp_one_hop', 'acc_fp_two_hop', 'acc_vp_old', 'acc_vp_new', 'acc_fp_old', 'acc_fp_new'
|
| 33 |
+
]
|
| 34 |
+
|
| 35 |
+
for col in detailed_columns:
|
| 36 |
+
if col not in df.columns:
|
| 37 |
+
df[col] = 0.0
|
| 38 |
+
|
| 39 |
+
# λλ©μΈλ³ μ νλ 컬λΌλ€μ΄ μμΌλ©΄ μΆκ° (freshqa_acc.pyμ μΌμΉ)
|
| 40 |
+
domain_columns = [
|
| 41 |
+
'acc_politics', 'acc_sports', 'acc_entertainment',
|
| 42 |
+
'acc_weather', 'acc_world', 'acc_economy',
|
| 43 |
+
'acc_society', 'acc_it_science', 'acc_life_culture', 'acc_unknown'
|
| 44 |
+
]
|
| 45 |
+
|
| 46 |
+
for col in domain_columns:
|
| 47 |
+
if col not in df.columns:
|
| 48 |
+
df[col] = 0.0
|
| 49 |
+
|
| 50 |
+
# accuracy κΈ°μ€μΌλ‘ μ λ ¬ (λνΉ κΈ°μ€) - λΉ λ°μ΄ν°νλ μμ΄ μλ λλ§
|
| 51 |
+
if not df.empty and 'accuracy' in df.columns:
|
| 52 |
+
df = df.sort_values('accuracy', ascending=False).reset_index(drop=True)
|
| 53 |
+
|
| 54 |
+
# rank 컬λΌμ μ μ₯νμ§ μκ³ νμ μμλ§ κ³μ°
|
| 55 |
+
# μ«μ 컬λΌλ€μ μλ³Έ κ·Έλλ‘ μ μ₯ (λ°μ¬λ¦Όνμ§ μμ)
|
| 56 |
+
|
| 57 |
+
# μ»¬λΌ μμλ₯Ό ν€λμ λ§μΆ°μ μ λ ¬ (rank μ μΈ)
|
| 58 |
+
column_order = [
|
| 59 |
+
'id', 'model', 'description', 'accuracy', 'fast_changing_accuracy',
|
| 60 |
+
'slow_changing_accuracy', 'never_changing_accuracy', 'acc_vp', 'acc_fp',
|
| 61 |
+
'acc_vp_one_hop', 'acc_vp_two_hop', 'acc_fp_one_hop', 'acc_fp_two_hop',
|
| 62 |
+
'acc_vp_old', 'acc_vp_new', 'acc_fp_old', 'acc_fp_new',
|
| 63 |
+
'acc_politics', 'acc_sports', 'acc_entertainment', 'acc_weather',
|
| 64 |
+
'acc_world', 'acc_economy', 'acc_society', 'acc_it_science',
|
| 65 |
+
'acc_life_culture', 'acc_unknown', 'total_questions', 'evaluation_date', 'evaluation_mode'
|
| 66 |
+
]
|
| 67 |
+
|
| 68 |
+
# μ‘΄μ¬νλ 컬λΌλ§ μ ννμ¬ μμλλ‘ μ λ ¬
|
| 69 |
+
available_columns = [col for col in column_order if col in df.columns]
|
| 70 |
+
df = df[available_columns]
|
| 71 |
+
|
| 72 |
+
return df
|
| 73 |
+
except FileNotFoundError:
|
| 74 |
+
# μ΄κΈ° λ°μ΄ν° (rank μ μΈ)
|
| 75 |
+
return pd.DataFrame({
|
| 76 |
+
'id': [],
|
| 77 |
+
'model': [],
|
| 78 |
+
'description': [],
|
| 79 |
+
'accuracy': [],
|
| 80 |
+
'fast_changing_accuracy': [],
|
| 81 |
+
'slow_changing_accuracy': [],
|
| 82 |
+
'never_changing_accuracy': [],
|
| 83 |
+
'acc_vp': [],
|
| 84 |
+
'acc_fp': [],
|
| 85 |
+
'acc_vp_one_hop': [],
|
| 86 |
+
'acc_vp_two_hop': [],
|
| 87 |
+
'acc_fp_one_hop': [],
|
| 88 |
+
'acc_fp_two_hop': [],
|
| 89 |
+
'acc_vp_old': [],
|
| 90 |
+
'acc_vp_new': [],
|
| 91 |
+
'acc_fp_old': [],
|
| 92 |
+
'acc_fp_new': [],
|
| 93 |
+
'acc_politics': [],
|
| 94 |
+
'acc_sports': [],
|
| 95 |
+
'acc_entertainment': [],
|
| 96 |
+
'acc_weather': [],
|
| 97 |
+
'acc_world': [],
|
| 98 |
+
'acc_economy': [],
|
| 99 |
+
'acc_society': [],
|
| 100 |
+
'acc_it_science': [],
|
| 101 |
+
'acc_life_culture': [],
|
| 102 |
+
'acc_unknown': [],
|
| 103 |
+
'total_questions': [],
|
| 104 |
+
'evaluation_date': [],
|
| 105 |
+
'evaluation_mode': []
|
| 106 |
+
})
|
| 107 |
+
|
| 108 |
+
def append_to_leaderboard_data(new_data_list):
|
| 109 |
+
"""리λ보λ λ°μ΄ν°μ μλ‘μ΄ κ²°κ³Ό μΆκ° (νμΌ μ κΈ μ¬μ©)"""
|
| 110 |
+
current_dir = os.path.dirname(os.path.abspath(__file__)) # src/ ν΄λ
|
| 111 |
+
project_root = os.path.dirname(current_dir) # νλ‘μ νΈ λ£¨νΈ
|
| 112 |
+
data_path = os.path.join(project_root, 'data', 'leaderboard_results.csv')
|
| 113 |
+
|
| 114 |
+
# νμΌ μ κΈμ μ¬μ©νμ¬ μμ νκ² μ½κΈ° -> μμ -> μ°κΈ°
|
| 115 |
+
with file_lock(data_path + '.lock'):
|
| 116 |
+
# νμΌμ΄ μ‘΄μ¬νλ©΄ μ½κΈ°
|
| 117 |
+
if os.path.exists(data_path):
|
| 118 |
+
existing_df = pd.read_csv(data_path)
|
| 119 |
+
for col in ['model', 'description']:
|
| 120 |
+
if col not in existing_df.columns:
|
| 121 |
+
existing_df[col] = pd.Series(dtype='object')
|
| 122 |
+
else:
|
| 123 |
+
# νμΌμ΄ μμΌλ©΄ λΉ DataFrame μμ±
|
| 124 |
+
existing_df = load_leaderboard_data() # μ΄κΈ° μ€ν€λ§ λ°ν
|
| 125 |
+
|
| 126 |
+
# μλ‘μ΄ λ°μ΄ν° μΆκ°
|
| 127 |
+
new_df = pd.DataFrame(new_data_list)
|
| 128 |
+
|
| 129 |
+
combined_df = pd.concat([existing_df, new_df], ignore_index=True)
|
| 130 |
+
|
| 131 |
+
# μ λ ¬ (accuracy κΈ°μ€)
|
| 132 |
+
if not combined_df.empty and 'accuracy' in combined_df.columns:
|
| 133 |
+
combined_df = combined_df.sort_values('accuracy', ascending=False).reset_index(drop=True)
|
| 134 |
+
|
| 135 |
+
desired_order = [
|
| 136 |
+
'id', 'model', 'description', 'accuracy', 'fast_changing_accuracy',
|
| 137 |
+
'slow_changing_accuracy', 'never_changing_accuracy', 'acc_vp', 'acc_fp',
|
| 138 |
+
'acc_vp_one_hop', 'acc_vp_two_hop', 'acc_fp_one_hop', 'acc_fp_two_hop',
|
| 139 |
+
'acc_vp_old', 'acc_vp_new', 'acc_fp_old', 'acc_fp_new',
|
| 140 |
+
'acc_politics', 'acc_sports', 'acc_entertainment', 'acc_weather',
|
| 141 |
+
'acc_world', 'acc_economy', 'acc_society', 'acc_it_science',
|
| 142 |
+
'acc_life_culture', 'acc_unknown', 'total_questions', 'evaluation_date', 'evaluation_mode'
|
| 143 |
+
]
|
| 144 |
+
combined_df = combined_df.reindex(columns=[col for col in desired_order if col in combined_df.columns])
|
| 145 |
+
|
| 146 |
+
# μ μ₯
|
| 147 |
+
combined_df.to_csv(data_path, index=False)
|
| 148 |
+
|
| 149 |
+
return combined_df
|
| 150 |
+
|
| 151 |
+
def prepare_display_data(df, global_ranking=None):
|
| 152 |
+
"""ν
μ΄λΈ νμμ© λ°μ΄ν° μ€λΉ (rank κ³μ° λ° λ°μ¬λ¦Ό μ μ©)"""
|
| 153 |
+
# λΉ λ°μ΄ν°νλ μμΈ κ²½μ° κ·Έλλ‘ λ°ν
|
| 154 |
+
if df.empty:
|
| 155 |
+
return df
|
| 156 |
+
|
| 157 |
+
display_df = df.copy()
|
| 158 |
+
if 'model' in display_df.columns:
|
| 159 |
+
display_df['model'] = display_df['model'].fillna('Anonymous Model')
|
| 160 |
+
display_df['model'] = display_df['model'].replace('', 'Anonymous Model')
|
| 161 |
+
if 'description' in display_df.columns:
|
| 162 |
+
display_df['description'] = display_df['description'].replace({None: '', pd.NA: ''}).fillna('')
|
| 163 |
+
|
| 164 |
+
# rank μ»¬λΌ μΆκ°
|
| 165 |
+
if 'accuracy' in display_df.columns:
|
| 166 |
+
if global_ranking is not None:
|
| 167 |
+
# μ 체 λνΉ μ λ³΄κ° μ 곡λ κ²½μ° μ¬μ©
|
| 168 |
+
display_df['rank'] = display_df.index.map(global_ranking)
|
| 169 |
+
else:
|
| 170 |
+
# μ 체 λνΉ μ λ³΄κ° μλ κ²½μ° accuracy κΈ°μ€μΌλ‘ μ λ ¬νμ¬ rank κ³μ°
|
| 171 |
+
display_df = display_df.sort_values('accuracy', ascending=False).reset_index(drop=True)
|
| 172 |
+
|
| 173 |
+
# rank μ»¬λΌ μΆκ° (1~3μλ μμ΄μ½, λλ¨Έμ§λ μ«μ)
|
| 174 |
+
def get_rank_display(rank):
|
| 175 |
+
if rank == 1:
|
| 176 |
+
return "π₯"
|
| 177 |
+
elif rank == 2:
|
| 178 |
+
return "π₯"
|
| 179 |
+
elif rank == 3:
|
| 180 |
+
return "π₯"
|
| 181 |
+
else:
|
| 182 |
+
return str(rank)
|
| 183 |
+
|
| 184 |
+
display_df['rank'] = [get_rank_display(i+1) for i in range(len(display_df))]
|
| 185 |
+
|
| 186 |
+
# μ«μ 컬λΌλ€μ μμ«μ 2λ²μ§Έμμ λ°μ¬λ¦Ό (νμμ©μΌλ‘λ§)
|
| 187 |
+
numeric_columns = [
|
| 188 |
+
'accuracy', 'fast_changing_accuracy', 'slow_changing_accuracy', 'never_changing_accuracy',
|
| 189 |
+
'acc_vp', 'acc_fp', 'acc_vp_one_hop', 'acc_vp_two_hop', 'acc_fp_one_hop', 'acc_fp_two_hop',
|
| 190 |
+
'acc_vp_old', 'acc_vp_new', 'acc_fp_old', 'acc_fp_new',
|
| 191 |
+
'acc_politics', 'acc_sports', 'acc_entertainment', 'acc_weather',
|
| 192 |
+
'acc_world', 'acc_economy', 'acc_society', 'acc_it_science',
|
| 193 |
+
'acc_life_culture', 'acc_unknown'
|
| 194 |
+
]
|
| 195 |
+
|
| 196 |
+
for col in numeric_columns:
|
| 197 |
+
if col in display_df.columns:
|
| 198 |
+
display_df[col] = display_df[col].round(2)
|
| 199 |
+
|
| 200 |
+
# μ»¬λΌ μμ μ¬μ λ ¬ (rankλ₯Ό 맨 μμ)
|
| 201 |
+
column_order = [
|
| 202 |
+
'rank', 'id', 'model', 'description', 'accuracy', 'fast_changing_accuracy',
|
| 203 |
+
'slow_changing_accuracy', 'never_changing_accuracy', 'acc_vp', 'acc_fp',
|
| 204 |
+
'acc_vp_one_hop', 'acc_vp_two_hop', 'acc_fp_one_hop', 'acc_fp_two_hop',
|
| 205 |
+
'acc_vp_old', 'acc_vp_new', 'acc_fp_old', 'acc_fp_new',
|
| 206 |
+
'acc_politics', 'acc_sports', 'acc_entertainment', 'acc_weather',
|
| 207 |
+
'acc_world', 'acc_economy', 'acc_society', 'acc_it_science',
|
| 208 |
+
'acc_life_culture', 'acc_unknown', 'total_questions', 'evaluation_date', 'evaluation_mode'
|
| 209 |
+
]
|
| 210 |
+
|
| 211 |
+
# μ‘΄μ¬νλ 컬λΌλ§ μ ννμ¬ μμλλ‘ μ λ ¬
|
| 212 |
+
available_columns = [col for col in column_order if col in display_df.columns]
|
| 213 |
+
display_df = display_df[available_columns]
|
| 214 |
+
|
| 215 |
+
return display_df
|
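The intended round trip through this module, with made-up values (the dict keys follow the column order defined above):

from src.leaderboard_manager import load_leaderboard_data, append_to_leaderboard_data, prepare_display_data

new_row = {
    "id": "sub-0001",                  # illustrative values only
    "model": "my-model",
    "description": "baseline run",
    "accuracy": 61.3,
    "total_questions": 600,
    "evaluation_date": "2025-01-01",
    "evaluation_mode": "PARALLEL",
}
append_to_leaderboard_data([new_row])                        # appends under a file lock and re-sorts by accuracy
display_df = prepare_display_data(load_leaderboard_data())   # adds the rank column and rounds for display only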
src/quick_csv_loader.py
ADDED
|
@@ -0,0 +1,158 @@
| 1 |
+
"""
|
| 2 |
+
λΉ λ₯Έ CSV λ‘λ - κ°λ¨ν μ¬μ©μ μν νΈμ ν¨μλ€
|
| 3 |
+
HF_TOKENμ μ΄μ©νμ¬ private repositoryμμ CSV νμΌμ λΉ λ₯΄κ² λ‘λν©λλ€.
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import os
|
| 7 |
+
import pandas as pd
|
| 8 |
+
from src.hf_private_csv_loader import HFPrivateCSVLoader
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
def quick_load_csv(repo_id: str, filename: str, token: str = None) -> pd.DataFrame:
|
| 12 |
+
"""
|
| 13 |
+
κ°μ₯ κ°λ¨ν λ°©λ²μΌλ‘ CSV νμΌμ λ‘λν©λλ€.
|
| 14 |
+
|
| 15 |
+
Args:
|
| 16 |
+
repo_id: Repository ID (μ: "username/repo-name")
|
| 17 |
+
filename: CSV νμΌλͺ
|
| 18 |
+
token: Hugging Face ν ν° (Noneμ΄λ©΄ νκ²½λ³μμμ μλ λ‘λ)
|
| 19 |
+
|
| 20 |
+
Returns:
|
| 21 |
+
pandas DataFrame
|
| 22 |
+
|
| 23 |
+
Raises:
|
| 24 |
+
Exception: λ‘λ μ€ν¨μ
|
| 25 |
+
"""
|
| 26 |
+
loader = HFPrivateCSVLoader(token=token)
|
| 27 |
+
df = loader.load_csv_from_private_repo(repo_id, filename)
|
| 28 |
+
|
| 29 |
+
if df is None:
|
| 30 |
+
raise Exception(f"CSV νμΌ λ‘λ μ€ν¨: {repo_id}/{filename}")
|
| 31 |
+
|
| 32 |
+
return df
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
def load_csv_with_env_token(repo_id: str, filename: str) -> pd.DataFrame:
|
| 36 |
+
"""
|
| 37 |
+
νκ²½λ³μμ ν ν°μ μ¬μ©νμ¬ CSV νμΌμ λ‘λν©λλ€.
|
| 38 |
+
|
| 39 |
+
Args:
|
| 40 |
+
repo_id: Repository ID
|
| 41 |
+
filename: CSV νμΌλͺ
|
| 42 |
+
|
| 43 |
+
Returns:
|
| 44 |
+
pandas DataFrame
|
| 45 |
+
|
| 46 |
+
Raises:
|
| 47 |
+
Exception: λ‘λ μ€ν¨μ
|
| 48 |
+
"""
|
| 49 |
+
return quick_load_csv(repo_id, filename, token=None)
|
| 50 |
+
|
| 51 |
+
|
| 52 |
+
def load_freshqa_results(repo_id: str, filename: str = "results.csv", token: str = None) -> pd.DataFrame:
|
| 53 |
+
"""
|
| 54 |
+
FreshQA νκ° κ²°κ³Ό CSV νμΌμ λ‘λν©λλ€.
|
| 55 |
+
|
| 56 |
+
Args:
|
| 57 |
+
repo_id: Repository ID
|
| 58 |
+
filename: κ²°κ³Ό νμΌλͺ  (κΈ°λ³Έκ°: "results.csv")
|
| 59 |
+
token: Hugging Face ν ν°
|
| 60 |
+
|
| 61 |
+
Returns:
|
| 62 |
+
pandas DataFrame
|
| 63 |
+
"""
|
| 64 |
+
df = quick_load_csv(repo_id, filename, token)
|
| 65 |
+
|
| 66 |
+
# FreshQA κ²°κ³Όμ νμν 컬λΌλ€μ΄ μλμ§ νμΈ
|
| 67 |
+
required_columns = ['id', 'accuracy', 'evaluation_date']
|
| 68 |
+
missing_columns = [col for col in required_columns if col not in df.columns]
|
| 69 |
+
|
| 70 |
+
# if missing_columns:
|
| 71 |
+
# print(f"β οΈ κ²½κ³ : λ€μ 컬λΌλ€μ΄ μμ΅λλ€: {missing_columns}")
|
| 72 |
+
|
| 73 |
+
return df
|
| 74 |
+
|
| 75 |
+
|
| 76 |
+
def merge_with_leaderboard(new_results_df: pd.DataFrame,
|
| 77 |
+
leaderboard_path: str = "data/leaderboard_results.csv") -> pd.DataFrame:
|
| 78 |
+
"""
|
| 79 |
+
μλ‘μ΄ κ²°κ³Όλ₯Ό κΈ°μ‘΄ 리λ보λμ λ³ν©ν©λλ€.
|
| 80 |
+
|
| 81 |
+
Args:
|
| 82 |
+
new_results_df: μλ‘μ΄ κ²°κ³Ό DataFrame
|
| 83 |
+
leaderboard_path: κΈ°μ‘΄ 리λ보λ νμΌ κ²½λ‘
|
| 84 |
+
|
| 85 |
+
Returns:
|
| 86 |
+
λ³ν©λ DataFrame
|
| 87 |
+
"""
|
| 88 |
+
try:
|
| 89 |
+
# κΈ°μ‘΄ 리λ보λ λ‘λ
|
| 90 |
+
existing_df = pd.read_csv(leaderboard_path)
|
| 91 |
+
|
| 92 |
+
# λ³ν©
|
| 93 |
+
merged_df = pd.concat([existing_df, new_results_df], ignore_index=True)
|
| 94 |
+
|
| 95 |
+
# μ€λ³΅ μ κ±° (λμΌν idμ evaluation_date μ‘°ν©)
|
| 96 |
+
if 'id' in merged_df.columns and 'evaluation_date' in merged_df.columns:
|
| 97 |
+
merged_df = merged_df.drop_duplicates(
|
| 98 |
+
subset=['id', 'evaluation_date'],
|
| 99 |
+
keep='last'
|
| 100 |
+
)
|
| 101 |
+
|
| 102 |
+
# μ λ ¬ (accuracy κΈ°μ€)
|
| 103 |
+
if 'accuracy' in merged_df.columns:
|
| 104 |
+
merged_df = merged_df.sort_values('accuracy', ascending=False)
|
| 105 |
+
|
| 106 |
+
# μ μ₯
|
| 107 |
+
merged_df.to_csv(leaderboard_path, index=False)
|
| 108 |
+
|
| 109 |
+
return merged_df
|
| 110 |
+
|
| 111 |
+
except FileNotFoundError:
|
| 112 |
+
# κΈ°μ‘΄ 리λ보λκ° μμΌλ©΄ μλ‘ μμ±
|
| 113 |
+
new_results_df.to_csv(leaderboard_path, index=False)
|
| 114 |
+
return new_results_df
|
| 115 |
+
|
| 116 |
+
|
| 117 |
+
# μ¬μ© μμ
|
| 118 |
+
if __name__ == "__main__":
|
| 119 |
+
# μμ 1: κ°μ₯ κ°λ¨ν μ¬μ©λ²
|
| 120 |
+
# === μμ 1: κ°λ¨ν μ¬μ©λ² ===
|
| 121 |
+
try:
|
| 122 |
+
df = quick_load_csv(
|
| 123 |
+
repo_id="username/private-dataset",
|
| 124 |
+
filename="data.csv",
|
| 125 |
+
token="your_token_here" # μ€μ ν ν°μΌλ‘ λ³κ²½
|
| 126 |
+
)
|
| 127 |
+
# print(f"β
λ‘λ μ±κ³΅: {len(df)} ν, {len(df.columns)} μ΄")
|
| 128 |
+
# print(f"컬λΌ: {list(df.columns)}")
|
| 129 |
+
except Exception as e:
|
| 130 |
+
print(f"β μ€λ₯: {e}")
|
| 131 |
+
|
| 132 |
+
# μμ 2: νκ²½λ³μ ν ν° μ¬μ©
|
| 133 |
+
# === μμ 2: νκ²½λ³μ ν ν° μ¬μ© ===
|
| 134 |
+
try:
|
| 135 |
+
df = load_csv_with_env_token(
|
| 136 |
+
repo_id="username/private-dataset",
|
| 137 |
+
filename="data.csv"
|
| 138 |
+
)
|
| 139 |
+
# print(f"β
λ‘λ μ±κ³΅: {len(df)} ν, {len(df.columns)} μ΄")
|
| 140 |
+
except Exception as e:
|
| 141 |
+
print(f"β μ€λ₯: {e}")
|
| 142 |
+
|
| 143 |
+
# μμ 3: FreshQA κ²°κ³Ό λ‘λ λ° λ³ν©
|
| 144 |
+
# === μμ 3: FreshQA κ²°κ³Ό λ‘λ λ° λ³ν© ===
|
| 145 |
+
try:
|
| 146 |
+
# FreshQA κ²°κ³Ό λ‘λ
|
| 147 |
+
results_df = load_freshqa_results(
|
| 148 |
+
repo_id="user/freshqa-results",
|
| 149 |
+
filename="evaluation_results.csv",
|
| 150 |
+
token="your_token_here" # μ€μ ν ν°μΌλ‘ λ³κ²½
|
| 151 |
+
)
|
| 152 |
+
|
| 153 |
+
# 리λ보λμ λ³ν©
|
| 154 |
+
merged_df = merge_with_leaderboard(results_df)
|
| 155 |
+
# print(f"β
λ³ν© μλ£: μ΄ {len(merged_df)} κ° κ²°κ³Ό")
|
| 156 |
+
|
| 157 |
+
except Exception as e:
|
| 158 |
+
print(f"β μ€λ₯: {e}")
|
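A behavioural difference worth keeping in mind when choosing between the two loaders: HFPrivateCSVLoader.load_csv_from_private_repo returns None on failure, while quick_load_csv raises, so downstream code can assume a real DataFrame. A minimal sketch with placeholder names:

from src.quick_csv_loader import quick_load_csv

try:
    base_df = quick_load_csv("your-org/your-private-dataset", "ko-freshqa_2025_test.csv")  # placeholders
except Exception as exc:
    raise SystemExit(f"Could not load the evaluation data: {exc}")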
src/submission_handler.py
ADDED
|
@@ -0,0 +1,615 @@
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import os
|
| 4 |
+
import time
|
| 5 |
+
import queue
|
| 6 |
+
from dataclasses import dataclass
|
| 7 |
+
from typing import Any, Optional, Dict, Tuple, Callable
|
| 8 |
+
import pandas as pd
|
| 9 |
+
import gradio as gr
|
| 10 |
+
|
| 11 |
+
from config import Config
|
| 12 |
+
from src.submission_tracker import get_submission_tracker, SubmissionTracker
|
| 13 |
+
from src.quick_csv_loader import quick_load_csv
|
| 14 |
+
from src.leaderboard_manager import append_to_leaderboard_data
|
| 15 |
+
from src.utils import get_current_datetime_str
|
| 16 |
+
from freshqa.fresheval_parallel import evaluate_dataframe_parallel
|
| 17 |
+
from freshqa.freshqa_acc import process_freshqa_dataframe, calculate_accuracy
|
| 18 |
+
from freshqa.merge_csv_with_model_response import merge_dataframe_with_model_response_df
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
# -------------------------
|
| 22 |
+
# κ³΅ν΅ λ°νν(Result)
|
| 23 |
+
# -------------------------
|
| 24 |
+
@dataclass
|
| 25 |
+
class Result:
|
| 26 |
+
ok: bool
|
| 27 |
+
data: Optional[Any] = None
|
| 28 |
+
error: Optional[str] = None
|
| 29 |
+
meta: Optional[Dict] = None
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
# -------------------------
|
| 33 |
+
# ν΅μ¬ νΈλ€λ¬
|
| 34 |
+
# -------------------------
|
| 35 |
+
class SubmissionHandler:
|
| 36 |
+
"""
|
| 37 |
+
μ μΆ νμΌ μ²λ¦¬ λ° FreshQA νκ° μ€μΌμ€νΈλ μ΄μ
.
|
| 38 |
+
- Tracker/Config μμ‘΄μ± μ£Όμ
|
| 39 |
+
- λ΄λΆ helperλ Result/λͺ
νν νμ
λ°ν
|
| 40 |
+
- μ€μ μ μ₯/νλ/μ¬μ©μ IDλ trackerκ° μ²λ¦¬(νΈλ€λ¬λ νΈμΆλ§)
|
| 41 |
+
"""
|
| 42 |
+
|
| 43 |
+
def __init__(self, tracker: Optional[SubmissionTracker] = None, cfg: Optional[type] = None):
|
| 44 |
+
# Dependency Injection
|
| 45 |
+
self.tracker = tracker
|
| 46 |
+
self.cfg = cfg or Config
|
| 47 |
+
|
| 48 |
+
# κΈ°μ‘΄ μ½λμ νΈνλλ μμ± (Config μ§μ μ°Έμ‘° μ κ±°)
|
| 49 |
+
self.enable_limit = getattr(self.cfg, "ENABLE_SUBMISSION_LIMIT", False)
|
| 50 |
+
self.repo_id = getattr(self.cfg, "FRESHQA_DATA_REPO_ID", None)
|
| 51 |
+
self.filename = getattr(self.cfg, "FRESHQA_DATA_FILENAME", None)
|
| 52 |
+
self.hf_token = getattr(self.cfg, "HF_TOKEN", None)
|
| 53 |
+
|
| 54 |
+
# νμ μ€μ μ κ²
|
| 55 |
+
if not self.repo_id:
|
| 56 |
+
raise ValueError("β FRESHQA_DATA_REPO_ID νκ²½ λ³μκ° μ€μ λμ§ μμμ΅λλ€.")
|
| 57 |
+
if not self.filename:
|
| 58 |
+
raise ValueError("β FRESHQA_DATA_FILENAME νκ²½ λ³μκ° μ€μ λμ§ μμμ΅λλ€.")
|
| 59 |
+
if not self.hf_token:
|
| 60 |
+
raise ValueError("β HF_TOKEN νκ²½ λ³μκ° μ€μ λμ§ μμμ΅λλ€.")
|
| 61 |
+
|
| 62 |
+
# --------- 1) μ μΆ νμΌ κ²μ¦ ----------
|
| 63 |
+
def _validate_submission_file(self, file) -> Result:
|
| 64 |
+
if file is None:
|
| 65 |
+
return Result(ok=False, error="β CSV νμΌμ μ
λ‘λν΄μ£ΌμΈμ.")
|
| 66 |
+
try:
|
| 67 |
+
df = pd.read_csv(file.name)
|
| 68 |
+
except Exception as e:
|
| 69 |
+
return Result(ok=False, error=f"β CSV λ‘λ© μ€ν¨: {e}")
|
| 70 |
+
|
| 71 |
+
required_columns = ["question", "model_response"]
|
| 72 |
+
for col in required_columns:
|
| 73 |
+
if col not in df.columns:
|
| 74 |
+
return Result(ok=False, error=f"β CSV νμΌμ 컬λΌμ '{col}'μ΄(κ°) μμ΅λλ€.")
|
| 75 |
+
if len(df) == 0:
|
| 76 |
+
return Result(ok=False, error="β CSV νμΌμ λ°μ΄ν°κ° μμ΅λλ€.")
|
| 77 |
+
if df["question"].isnull().any() or df["model_response"].isnull().any():
|
| 78 |
+
return Result(ok=False, error="β 'question' λλ 'model_response' 컬λΌμ λλ½λ κ°μ΄ μμ΅λλ€.")
|
| 79 |
+
|
| 80 |
+
return Result(ok=True)
|
| 81 |
+
|
| 82 |
+
# --------- 2) λΉ λ₯Έ λ‘λ© ----------
|
| 83 |
+
def _load_submission_df(self, file) -> Result:
|
| 84 |
+
try:
|
| 85 |
+
df = quick_load_csv(self.repo_id, self.filename, self.hf_token)
|
| 86 |
+
except Exception as e:
|
| 87 |
+
return Result(ok=False, error=f"β CSV λ‘λ© μ€ν¨: {e}")
|
| 88 |
+
return Result(ok=True, data=df)
|
| 89 |
+
|
| 90 |
+
# --------- 3) λ³ν© ----------
|
| 91 |
+
def _merge_with_base(self, submission_df: pd.DataFrame, file_name: str) -> Result:
|
| 92 |
+
try:
|
| 93 |
+
merged_df = merge_dataframe_with_model_response_df(submission_df, file_name)
|
| 94 |
+
return Result(ok=True, data=merged_df)
|
| 95 |
+
except Exception as e:
|
| 96 |
+
return Result(ok=False, error=f"β κΈ°μ€ λ°μ΄ν°μ λ³ν© μ€ν¨: {e}")
|
| 97 |
+
|
| 98 |
+
# --------- 4) νκ° ----------
|
| 99 |
+
def _evaluate_freshqa(
|
| 100 |
+
self,
|
| 101 |
+
merged_df: pd.DataFrame,
|
| 102 |
+
on_progress: Optional[Callable[[int, int, str], None]] = None,
|
| 103 |
+
) -> Result:
|
| 104 |
+
"""Relaxed/Strict λμ μ€ν + ν κΈ°λ° μ§νλ₯ κ°±μ """
|
| 105 |
+
q: "queue.Queue[Tuple[int, int, str]]" = queue.Queue()
|
| 106 |
+
|
| 107 |
+
# λ λͺ¨λ(Relaxed, Strict)λ₯Ό λ³λ ¬λ‘ μ²λ¦¬νλ―λ‘ μ΄ μ§ν λ¨μλ 2λ°°
|
| 108 |
+
total_items = len(merged_df) * 2
|
| 109 |
+
done_count = 0
|
| 110 |
+
|
| 111 |
+
def _drain_queue(block: bool = False):
|
| 112 |
+
nonlocal done_count
|
| 113 |
+
while True:
|
| 114 |
+
try:
|
| 115 |
+
item = q.get(block=block, timeout=0.05 if block else 0)
|
| 116 |
+
except Exception:
|
| 117 |
+
break
|
| 118 |
+
try:
|
| 119 |
+
# μ΅μ μ»€λ° κΈ°μ€: progress_queueμλ 1μ© μ¦κ°νλ μ μλ§ λ€μ΄μ΅λλ€.
|
| 120 |
+
if isinstance(item, int):
|
| 121 |
+
done_count += item
|
| 122 |
+
if on_progress:
|
| 123 |
+
remaining = max(total_items - done_count, 0)
|
| 124 |
+
desc_text = f"νκ° μ€... {done_count}/{total_items}"
|
| 125 |
+
on_progress(done_count, total_items, desc_text)
|
| 126 |
+
# νΉμ κ³Όκ±° ν¬λ§·(tuple)μ΄ λ€μ΄μ€λλΌλ λ°©μ΄μ μΌλ‘ μ²λ¦¬
|
| 127 |
+
elif isinstance(item, tuple) and len(item) == 3 and on_progress:
|
| 128 |
+
on_progress(item[0], item[1], item[2])
|
| 129 |
+
finally:
|
| 130 |
+
q.task_done()
|
| 131 |
+
|
| 132 |
+
from concurrent.futures import ThreadPoolExecutor
|
| 133 |
+
|
| 134 |
+
try:
|
| 135 |
+
with ThreadPoolExecutor(max_workers=2) as ex:
|
| 136 |
+
relaxed_f = ex.submit(
|
| 137 |
+
evaluate_dataframe_parallel,
|
| 138 |
+
df=merged_df,
|
| 139 |
+
mode="Relaxed",
|
| 140 |
+
on_item_done=None,
|
| 141 |
+
progress_queue=q,
|
| 142 |
+
)
|
| 143 |
+
strict_f = ex.submit(
|
| 144 |
+
evaluate_dataframe_parallel,
|
| 145 |
+
df=merged_df,
|
| 146 |
+
mode="Strict",
|
| 147 |
+
on_item_done=None,
|
| 148 |
+
progress_queue=q,
|
| 149 |
+
)
|
| 150 |
+
|
| 151 |
+
while True:
|
| 152 |
+
_drain_queue(block=False)
|
| 153 |
+
if relaxed_f.done() and strict_f.done():
|
| 154 |
+
break
|
| 155 |
+
time.sleep(0.05)
|
| 156 |
+
|
| 157 |
+
_drain_queue(block=True)
|
| 158 |
+
|
| 159 |
+
relaxed = relaxed_f.result()
|
| 160 |
+
strict = strict_f.result()
|
| 161 |
+
|
| 162 |
+
return Result(ok=True, data=(relaxed, strict))
|
| 163 |
+
except Exception as e:
|
| 164 |
+
return Result(ok=False, error=f"β νκ° μ€ μ€λ₯ λ°μ: {e}")
|
| 165 |
+
|
| 166 |
+
# --------- 5) μ νλ κ³μ° ----------
|
| 167 |
+
def _calculate_accuracy(self, fresheval_df: pd.DataFrame) -> Result:
|
| 168 |
+
try:
|
| 169 |
+
processed = process_freshqa_dataframe(fresheval_df)
|
| 170 |
+
accs, counts = calculate_accuracy(processed)
|
| 171 |
+
return Result(ok=True, data=(processed, accs, counts))
|
| 172 |
+
except Exception as e:
|
| 173 |
+
return Result(ok=False, error=f"β κ²°κ³Ό μ§κ³ μ€ μ€λ₯κ° λ°μνμ΅λλ€: {e}")
|
| 174 |
+
|
| 175 |
+
# --------- 6) μμ½ ----------
|
| 176 |
+
def _build_summary(self, name: str, relaxed_accs: dict, strict_accs: dict) -> str:
|
| 177 |
+
"""
|
| 178 |
+
result_summaryκ° κΈ°λνλ μ΄μ λ¬Έμμ΄ ν¬λ§·μ κ·Έλλ‘ μ μ§ν©λλ€.
|
| 179 |
+
- ν€λ/μΉμ
μ λͺ©/μ€λ°κΏ/νν(μμμ 1μ리) λμΌ
|
| 180 |
+
- ν
μ€νΈμ
κΈ°μ€ μ§ν: acc_test, *_fast_changing, *_two_hop, *_old, *_new, *_vp, *_fp
|
| 181 |
+
"""
|
| 182 |
+
submitter = name if name else "(μ΄λ¦ λ―Έμ
λ ₯)"
|
| 183 |
+
|
| 184 |
+
lines = []
|
| 185 |
+
lines.append(f"**μ μΆμ**: {submitter}")
|
| 186 |
+
lines.append("")
|
| 187 |
+
lines.append("**μ νλ (ν
μ€νΈμ
κΈ°μ€)**")
|
| 188 |
+
lines.append(f"- Relaxed: {relaxed_accs.get('acc_test', 0):.1f}%")
|
| 189 |
+
lines.append(f"- Strict: {strict_accs.get('acc_test', 0):.1f}%")
|
| 190 |
+
lines.append("")
|
| 191 |
+
lines.append("**μΈλΆ μ§ν (ν
μ€νΈμ
)**")
|
| 192 |
+
lines.append(
|
| 193 |
+
f"- Fast Changing: R {relaxed_accs.get('acc_test_fast_changing', 0):.1f}% / "
|
| 194 |
+
f"S {strict_accs.get('acc_test_fast_changing', 0):.1f}%"
|
| 195 |
+
)
|
| 196 |
+
lines.append(
|
| 197 |
+
f"- Two-hop: R {relaxed_accs.get('acc_test_two_hop', 0):.1f}% / "
|
| 198 |
+
f"S {strict_accs.get('acc_test_two_hop', 0):.1f}%"
|
| 199 |
+
)
|
| 200 |
+
lines.append(
|
| 201 |
+
f"- Old: R {relaxed_accs.get('acc_test_old', 0):.1f}% / "
|
| 202 |
+
f"S {strict_accs.get('acc_test_old', 0):.1f}%"
|
| 203 |
+
)
|
| 204 |
+
lines.append(
|
| 205 |
+
f"- New: R {relaxed_accs.get('acc_test_new', 0):.1f}% / "
|
| 206 |
+
f"S {strict_accs.get('acc_test_new', 0):.1f}%"
|
| 207 |
+
)
|
| 208 |
+
lines.append(
|
| 209 |
+
f"- VP: R {relaxed_accs.get('acc_test_vp', 0):.1f}% / "
|
| 210 |
+
f"S {strict_accs.get('acc_test_vp', 0):.1f}%"
|
| 211 |
+
)
|
| 212 |
+
lines.append(
|
| 213 |
+
f"- FP: R {relaxed_accs.get('acc_test_fp', 0):.1f}% / "
|
| 214 |
+
f"S {strict_accs.get('acc_test_fp', 0):.1f}%"
|
| 215 |
+
)
|
| 216 |
+
return "\n".join(lines)
|
| 217 |
+
|
| 218 |
+
def _get_result_summary(
|
| 219 |
+
self,
|
| 220 |
+
file_name: str,
|
| 221 |
+
name: str,
|
| 222 |
+
relaxed_accs: dict,
|
| 223 |
+
strict_accs: dict,
|
| 224 |
+
relaxed_table: pd.DataFrame,
|
| 225 |
+
strict_table: pd.DataFrame,
|
| 226 |
+
) -> str:
|
| 227 |
+
# 보기 μ’μ ν¬λ§·μΌλ‘ μΌμͺ½ μ λ ¬/ꡬλΆμ /μ¬λ°±μ μ μ©ν΄ λ¬Έμμ΄ κ΅¬μ±
|
| 228 |
+
display_file = os.path.basename(file_name) if file_name else ""
|
| 229 |
+
lines: list[str] = []
|
| 230 |
+
lines.append("β
μ μΆ λ° νκ° μλ£")
|
| 231 |
+
lines.append("")
|
| 232 |
+
lines.append("[κΈ°λ³Έ μ 보]")
|
| 233 |
+
lines.append(f"- μ μΆ νμΌ: {display_file}")
|
| 234 |
+
lines.append(f"- νκ° μμ€ν
: Solar Pro API")
|
| 235 |
+
lines.append("")
|
| 236 |
+
lines.append("[κ²°κ³Ό μμ½]")
|
| 237 |
+
lines.append("- Relaxed λͺ¨λ")
|
| 238 |
+
lines.append(f" Β· μ 체 μ νλ: {float(relaxed_accs.get('acc', 0)):.1f}%")
|
| 239 |
+
lines.append(
|
| 240 |
+
f" Β· Fast-changing: {float(relaxed_accs.get('acc_fast_changing', 0)):.1f}% | "
|
| 241 |
+
f"Slow-changing: {float(relaxed_accs.get('acc_slow_changing', 0)):.1f}% | "
|
| 242 |
+
f"Never-changing: {float(relaxed_accs.get('acc_never_changing', 0)):.1f}%"
|
| 243 |
+
)
|
| 244 |
+
lines.append(f" Β· False premise: {float(relaxed_accs.get('acc_fp', 0)):.1f}%")
|
| 245 |
+
lines.append("")
|
| 246 |
+
lines.append("- Strict λͺ¨λ")
|
| 247 |
+
lines.append(f" Β· μ 체 μ νλ: {float(strict_accs.get('acc', 0)):.1f}%")
|
| 248 |
+
lines.append(
|
| 249 |
+
f" Β· Fast-changing: {float(strict_accs.get('acc_fast_changing', 0)):.1f}% | "
|
| 250 |
+
f"Slow-changing: {float(strict_accs.get('acc_slow_changing', 0)):.1f}% | "
|
| 251 |
+
f"Never-changing: {float(strict_accs.get('acc_never_changing', 0)):.1f}%"
|
| 252 |
+
)
|
| 253 |
+
lines.append(f" Β· False premise: {float(strict_accs.get('acc_fp', 0)):.1f}%")
|
| 254 |
+
lines.append("")
|
| 255 |
+
lines.append("[μ μΆ λ©ν]")
|
| 256 |
+
lines.append(f"- μ μΆμ: {name if name else 'Unknown'}")
|
| 257 |
+
lines.append(f"- νκ° μΌμ: {get_current_datetime_str()}")
|
| 258 |
+
lines.append(f"- λΉκ³ : Relaxed/Strict κ²°κ³Όκ° λ¦¬λ보λμ λ°μλμμ΅λλ€.")
|
| 259 |
+
lines.append("")
|
| 260 |
+
sep = "-" * 60
|
| 261 |
+
lines.append(sep)
|
| 262 |
+
lines.append("μμΈ κ²°κ³Ό ν
μ΄λΈ (Relaxed)")
|
| 263 |
+
lines.append(sep)
|
| 264 |
+
lines.append(relaxed_table.to_string(index=False))
|
| 265 |
+
lines.append("")
|
| 266 |
+
lines.append(sep)
|
| 267 |
+
lines.append("μμΈ κ²°κ³Ό ν
μ΄λΈ (Strict)")
|
| 268 |
+
lines.append(sep)
|
| 269 |
+
lines.append(strict_table.to_string(index=False))
|
| 270 |
+
return "\n".join(lines)
|
| 271 |
+
|
| 272 |
+
# --------- 7) μ νλ ν ----------
|
| 273 |
+
def _create_detailed_results_table(self, accs: dict, counts: dict) -> pd.DataFrame:
|
| 274 |
+
table_data = []
|
| 275 |
+
|
| 276 |
+
# μ 체 μ νλ
|
| 277 |
+
table_data.append({
|
| 278 |
+
'μΉ΄ν
κ³ λ¦¬': 'μ 체 μ νλ',
|
| 279 |
+
'μ 체': f"{accs.get('acc', 0):.1f}% ({counts.get('acc', 0)}κ°)",
|
| 280 |
+
'ν
μ€νΈ': f"{accs.get('acc_test', 0):.1f}% ({counts.get('acc_test', 0)}κ°)",
|
| 281 |
+
'κ°λ°': f"{accs.get('acc_dev', 0):.1f}% ({counts.get('acc_dev', 0)}κ°)"
|
| 282 |
+
})
|
| 283 |
+
|
| 284 |
+
# μ¬μ€ μ νλ³ μ νλ
|
| 285 |
+
fact_types = {
|
| 286 |
+
'fast_changing': 'λΉ λ₯΄κ² λ³νλ μ¬μ€',
|
| 287 |
+
'slow_changing': 'μ²μ²ν λ³νλ μ¬μ€',
|
| 288 |
+
'never_changing': 'λ³νμ§ μλ μ¬μ€'
|
| 289 |
+
}
|
| 290 |
+
|
| 291 |
+
for key, name in fact_types.items():
|
| 292 |
+
table_data.append({
|
| 293 |
+
'μΉ΄ν
κ³ λ¦¬': name,
|
| 294 |
+
'μ 체': f"{accs.get(f'acc_{key}', 0):.1f}% ({counts.get(f'acc_{key}', 0)}κ°)",
|
| 295 |
+
'ν
μ€νΈ': f"{accs.get(f'acc_test_{key}', 0):.1f}% ({counts.get(f'acc_test_{key}', 0)}κ°)",
|
| 296 |
+
'κ°λ°': f"{accs.get(f'acc_dev_{key}', 0):.1f}% ({counts.get(f'acc_dev_{key}', 0)}κ°)"
|
| 297 |
+
})
|
| 298 |
+
|
| 299 |
+
# μ§λ¬Έ μ νλ³ μ νλ
|
| 300 |
+
question_types = {
|
| 301 |
+
'vp': 'μ ν¨ν μ μ (Valid Premise)',
|
| 302 |
+
'fp': 'μλͺ»λ μ μ (False Premise)'
|
| 303 |
+
}
|
| 304 |
+
|
| 305 |
+
for key, name in question_types.items():
|
| 306 |
+
table_data.append({
|
| 307 |
+
'μΉ΄ν
κ³ λ¦¬': name,
|
| 308 |
+
'μ 체': f"{accs.get(f'acc_{key}', 0):.1f}% ({counts.get(f'acc_{key}', 0)}κ°)",
|
| 309 |
+
'ν
μ€νΈ': f"{accs.get(f'acc_test_{key}', 0):.1f}% ({counts.get(f'acc_test_{key}', 0)}κ°)",
|
| 310 |
+
'κ°λ°': f"{accs.get(f'acc_dev_{key}', 0):.1f}% ({counts.get(f'acc_dev_{key}', 0)}κ°)"
|
| 311 |
+
})
|
| 312 |
+
|
| 313 |
+
# ν μλ³ μ νλ
|
| 314 |
+
table_data.append({
|
| 315 |
+
'μΉ΄ν
κ³ λ¦¬': f" β {name} (λ¨μΌ ν)",
|
| 316 |
+
'μ 체': f"{accs.get(f'acc_{key}_one_hop', 0):.1f}% ({counts.get(f'acc_{key}_one_hop', 0)}κ°)",
|
| 317 |
+
'ν
μ€νΈ': f"{accs.get(f'acc_test_{key}_one_hop', 0):.1f}% ({counts.get(f'acc_test_{key}_one_hop', 0)}κ°)",
|
| 318 |
+
'κ°λ°': f"{accs.get(f'acc_dev_{key}_one_hop', 0):.1f}% ({counts.get(f'acc_dev_{key}_one_hop', 0)}κ°)"
|
| 319 |
+
})
|
| 320 |
+
|
| 321 |
+
table_data.append({
|
| 322 |
+
'μΉ΄ν
κ³ λ¦¬': f" β {name} (λ€μ€ ν)",
|
| 323 |
+
'μ 체': f"{accs.get(f'acc_{key}_two_hop', 0):.1f}% ({counts.get(f'acc_{key}_two_hop', 0)}κ°)",
|
| 324 |
+
'ν
μ€νΈ': f"{accs.get(f'acc_test_{key}_two_hop', 0):.1f}% ({counts.get(f'acc_test_{key}_two_hop', 0)}κ°)",
|
| 325 |
+
'κ°λ°': f"{accs.get(f'acc_dev_{key}_two_hop', 0):.1f}% ({counts.get(f'acc_dev_{key}_two_hop', 0)}κ°)"
|
| 326 |
+
})
|
| 327 |
+
|
| 328 |
+
# μ°λλ³ μ νλ
|
| 329 |
+
table_data.append({
|
| 330 |
+
'μΉ΄ν
κ³ λ¦¬': f" β {name} (μ€λλ λ°μ΄ν°)",
|
| 331 |
+
'μ 체': f"{accs.get(f'acc_{key}_old', 0):.1f}% ({counts.get(f'acc_{key}_old', 0)}κ°)",
|
| 332 |
+
'ν
μ€νΈ': f"{accs.get(f'acc_test_{key}_old', 0):.1f}% ({counts.get(f'acc_test_{key}_old', 0)}κ°)",
|
| 333 |
+
'κ°λ°': f"{accs.get(f'acc_dev_{key}_old', 0):.1f}% ({counts.get(f'acc_dev_{key}_old', 0)}κ°)"
|
| 334 |
+
})
|
| 335 |
+
|
| 336 |
+
table_data.append({
|
| 337 |
+
'μΉ΄ν
κ³ λ¦¬': f" β {name} (μ΅οΏ½οΏ½ λ°μ΄ν°)",
|
| 338 |
+
'μ 체': f"{accs.get(f'acc_{key}_new', 0):.1f}% ({counts.get(f'acc_{key}_new', 0)}κ°)",
|
| 339 |
+
'ν
μ€νΈ': f"{accs.get(f'acc_test_{key}_new', 0):.1f}% ({counts.get(f'acc_test_{key}_new', 0)}κ°)",
|
| 340 |
+
'κ°λ°': f"{accs.get(f'acc_dev_{key}_new', 0):.1f}% ({counts.get(f'acc_dev_{key}_new', 0)}κ°)"
|
| 341 |
+
})
|
| 342 |
+
|
| 343 |
+
return pd.DataFrame(table_data)
|
| 344 |
+
|
| 345 |
+
# --------- 8) 리λ보λ ν μμ± ----------
|
| 346 |
+
def _build_leaderboard_rows(
|
| 347 |
+
self,
|
| 348 |
+
name: str,
|
| 349 |
+
submit_model: str,
|
| 350 |
+
submit_description: Optional[str],
|
| 351 |
+
mode: str,
|
| 352 |
+
accs: dict
|
| 353 |
+
):
|
| 354 |
+
submitter_id = f"{name}".strip()
|
| 355 |
+
result = {
|
| 356 |
+
'id': submitter_id if submitter_id else "Unknown",
|
| 357 |
+
'model': submit_model,
|
| 358 |
+
'description': submit_description,
|
| 359 |
+
'accuracy': float(accs.get('acc_test', 0)),
|
| 360 |
+
'fast_changing_accuracy': float(accs.get('acc_test_fast_changing', 0)),
|
| 361 |
+
'slow_changing_accuracy': float(accs.get('acc_test_slow_changing', 0)),
|
| 362 |
+
'never_changing_accuracy': float(accs.get('acc_test_never_changing', 0)),
|
| 363 |
+
'acc_vp': float(accs.get('acc_test_vp', 0)),
|
| 364 |
+
'acc_fp': float(accs.get('acc_test_fp', 0)),
|
| 365 |
+
'acc_vp_one_hop': float(accs.get('acc_test_vp_one_hop', 0)),
|
| 366 |
+
'acc_vp_two_hop': float(accs.get('acc_test_vp_two_hop', 0)),
|
| 367 |
+
'acc_fp_one_hop': float(accs.get('acc_test_fp_one_hop', 0)),
|
| 368 |
+
'acc_fp_two_hop': float(accs.get('acc_test_fp_two_hop', 0)),
|
| 369 |
+
'acc_vp_old': float(accs.get('acc_test_vp_old', 0)),
|
| 370 |
+
'acc_vp_new': float(accs.get('acc_test_vp_new', 0)),
|
| 371 |
+
'acc_fp_old': float(accs.get('acc_test_fp_old', 0)),
|
| 372 |
+
'acc_fp_new': float(accs.get('acc_test_fp_new', 0)),
|
| 373 |
+
# λλ©μΈλ³ μ νλ μΆκ° (test κ²°κ³Όλ§ μ¬μ©)
|
| 374 |
+
'acc_politics': float(accs.get('acc_test_politics', 0)),
|
| 375 |
+
'acc_sports': float(accs.get('acc_test_sports', 0)),
|
| 376 |
+
'acc_entertainment': float(accs.get('acc_test_entertainment', 0)),
|
| 377 |
+
'acc_weather': float(accs.get('acc_test_weather', 0)),
|
| 378 |
+
'acc_world': float(accs.get('acc_test_world', 0)),
|
| 379 |
+
'acc_economy': float(accs.get('acc_test_economy', 0)),
|
| 380 |
+
'acc_society': float(accs.get('acc_test_society', 0)),
|
| 381 |
+
'acc_it_science': float(accs.get('acc_test_it_science', 0)),
|
| 382 |
+
'acc_life_culture': float(accs.get('acc_test_life_culture', 0)),
|
| 383 |
+
'acc_unknown': float(accs.get('acc_test_unknown', 0)),
|
| 384 |
+
'total_questions': int(accs.get('acc_test', 0)),
|
| 385 |
+
'evaluation_date': get_current_datetime_str(),
|
| 386 |
+
'evaluation_mode': mode
|
| 387 |
+
}
|
| 388 |
+
return result
|
| 389 |
+
|
| 390 |
+
def _save_leaderboard(
|
| 391 |
+
self,
|
| 392 |
+
name: str,
|
| 393 |
+
submit_model: str,
|
| 394 |
+
submit_description: Optional[str],
|
| 395 |
+
relaxed_accs: dict,
|
| 396 |
+
strict_accs: dict
|
| 397 |
+
):
|
| 398 |
+
rows = [
|
| 399 |
+
self._build_leaderboard_rows(name, submit_model, submit_description, 'Relaxed', relaxed_accs),
|
| 400 |
+
self._build_leaderboard_rows(name, submit_model, submit_description, 'Strict', strict_accs),
|
| 401 |
+
]
|
| 402 |
+
try:
|
| 403 |
+
append_to_leaderboard_data(rows)
|
| 404 |
+
except Exception as e:
|
| 405 |
+
print(f"β οΈ λ¦¬λ보λ μ μ₯ μ€ν¨: {e}")
|
| 406 |
+
|
| 407 |
+
|
| 408 |
+
# --------- 9) κ³΅κ° μλν¬μΈνΈ(ν΅μ¬) ----------
|
| 409 |
+
def process_submission(
|
| 410 |
+
self,
|
| 411 |
+
file,
|
| 412 |
+
name: str,
|
| 413 |
+
submit_model: str,
|
| 414 |
+
submit_description: str,
|
| 415 |
+
progress: gr.Progress = gr.Progress()
|
| 416 |
+
) -> str:
|
| 417 |
+
"""
|
| 418 |
+
μ μΆ νμΌ μ²λ¦¬ λ° νκ°
|
| 419 |
+
- λ΄λΆ helperλ Result κΈ°λ°μΌλ‘ 리ν΄
|
| 420 |
+
- μ΅μ’
Gradio μΆλ ₯μ λ¬Έμμ΄(κΈ°μ‘΄ νΈν)
|
| 421 |
+
"""
|
| 422 |
+
start = time.time()
|
| 423 |
+
|
| 424 |
+
normalized_model = (submit_model or "").strip() or "Anonymous Model"
|
| 425 |
+
normalized_description_raw = (submit_description or "").strip()
|
| 426 |
+
normalized_description = normalized_description_raw if normalized_description_raw else None
|
| 427 |
+
|
| 428 |
+
# 1) μ μΆ μ ν νμΈ
|
| 429 |
+
tracker: Optional[SubmissionTracker] = None
|
| 430 |
+
if self.enable_limit:
|
| 431 |
+
tracker = self.tracker or get_submission_tracker()
|
| 432 |
+
if tracker is not None:
|
| 433 |
+
self.tracker = tracker
|
| 434 |
+
if self.enable_limit and tracker:
|
| 435 |
+
try:
|
| 436 |
+
can_submit, message, remaining = tracker.can_submit()
|
| 437 |
+
if not can_submit:
|
| 438 |
+
return f"β μ μΆ μ ν: {message}"
|
| 439 |
+
except Exception as e:
|
| 440 |
+
return f"β μ μΆ μ ν νμΈ μ€ν¨: {e}"
|
| 441 |
+
|
| 442 |
+
# 2) νμΌ κ²μ¦
|
| 443 |
+
progress(0.05, desc="μ μΆ νμΌ κ²μ¦ μ€...")
|
| 444 |
+
v = self._validate_submission_file(file)
|
| 445 |
+
if not v.ok:
|
| 446 |
+
return v.error or "β μ μΆ νμΌ κ²μ¦ μ€ν¨"
|
| 447 |
+
|
| 448 |
+
# 3) λ‘λ
|
| 449 |
+
progress(0.1, desc="κΈ°μ€ λ°μ΄ν° λ‘λ μ€...")
|
| 450 |
+
loaded = self._load_submission_df(file)
|
| 451 |
+
if not loaded.ok:
|
| 452 |
+
return loaded.error or "β CSV λ‘λ© μ€ν¨"
|
| 453 |
+
submission_df: pd.DataFrame = loaded.data
|
| 454 |
+
|
| 455 |
+
# 4) λ³ν©
|
| 456 |
+
progress(0.15, desc="κΈ°μ€ λ°μ΄ν°μ λ³ν© μ€...")
|
| 457 |
+
mg = self._merge_with_base(submission_df, file.name)
|
| 458 |
+
if not mg.ok:
|
| 459 |
+
return mg.error or "β κΈ°μ€ λ°μ΄ν° λ³ν© μ€ν¨"
|
| 460 |
+
merged_df: pd.DataFrame = mg.data
|
| 461 |
+
|
| 462 |
+
# 5) νκ° (0.15 ~ 0.9 κ΅¬κ° μ§νλ₯ λ§€ν)
|
| 463 |
+
progress(0.15, desc="FreshQA νκ° μ€λΉ μ€...")
|
| 464 |
+
|
| 465 |
+
def on_inner_progress(done: int, total: int, desc: str):
|
| 466 |
+
frac = 0.15 + 0.75 * (done / max(total, 1))
|
| 467 |
+
progress(frac, desc=desc)
|
| 468 |
+
|
| 469 |
+
ev = self._evaluate_freshqa(merged_df, on_progress=on_inner_progress)
|
| 470 |
+
if not ev.ok:
|
| 471 |
+
# μ€ν¨ κΈ°λ‘
|
| 472 |
+
if self.enable_limit and tracker:
|
| 473 |
+
try:
|
| 474 |
+
tracker.record_submission(
|
| 475 |
+
name,
|
| 476 |
+
os.path.basename(file.name),
|
| 477 |
+
success=False,
|
| 478 |
+
error_message=ev.error or "νκ° μ€ν¨",
|
| 479 |
+
submit_model=normalized_model,
|
| 480 |
+
submit_description=normalized_description,
|
| 481 |
+
)
|
| 482 |
+
except Exception:
|
| 483 |
+
pass
|
| 484 |
+
return ev.error or "β νκ° μ€ μ€λ₯κ° λ°μνμ΅λλ€"
|
| 485 |
+
|
| 486 |
+
relaxed_df, strict_df = ev.data # type: ignore[assignment]
|
| 487 |
+
|
| 488 |
+
# 6) κ²°κ³Ό μ§κ³
|
| 489 |
+
progress(0.8, desc="νκ° κ²°κ³Ό λΆμ μ€...")
|
| 490 |
+
r = self._calculate_accuracy(relaxed_df)
|
| 491 |
+
if not r.ok:
|
| 492 |
+
if self.enable_limit and tracker:
|
| 493 |
+
try:
|
| 494 |
+
tracker.record_submission(
|
| 495 |
+
name,
|
| 496 |
+
os.path.basename(file.name),
|
| 497 |
+
success=False,
|
| 498 |
+
error_message=r.error or "μ§κ³ μ€ν¨",
|
| 499 |
+
submit_model=normalized_model,
|
| 500 |
+
submit_description=normalized_description,
|
| 501 |
+
)
|
| 502 |
+
except Exception:
|
| 503 |
+
pass
|
| 504 |
+
return r.error or "β κ²°κ³Ό μ§κ³ μ€ν¨"
|
| 505 |
+
|
| 506 |
+
s = self._calculate_accuracy(strict_df)
|
| 507 |
+
if not s.ok:
|
| 508 |
+
if self.enable_limit and tracker:
|
| 509 |
+
try:
|
| 510 |
+
tracker.record_submission(
|
| 511 |
+
name,
|
| 512 |
+
os.path.basename(file.name),
|
| 513 |
+
success=False,
|
| 514 |
+
error_message=s.error or "μ§κ³ μ€ν¨",
|
| 515 |
+
submit_model=normalized_model,
|
| 516 |
+
submit_description=normalized_description,
|
| 517 |
+
)
|
| 518 |
+
except Exception:
|
| 519 |
+
pass
|
| 520 |
+
return s.error or "β κ²°κ³Ό μ§κ³ μ€ν¨"
|
| 521 |
+
|
| 522 |
+
relaxed_processed, relaxed_accs, relaxed_counts = r.data # type: ignore[misc]
|
| 523 |
+
strict_processed, strict_accs, strict_counts = s.data # type: ignore[misc]
|
| 524 |
+
|
| 525 |
+
# 7) μμ½/ν
|
| 526 |
+
relaxed_table = self._create_detailed_results_table(relaxed_accs, relaxed_counts)
|
| 527 |
+
strict_table = self._create_detailed_results_table(strict_accs, strict_counts)
|
| 528 |
+
|
| 529 |
+
result_summary = self._get_result_summary(
|
| 530 |
+
file_name=file.name if file else "",
|
| 531 |
+
name=name,
|
| 532 |
+
relaxed_accs=relaxed_accs,
|
| 533 |
+
strict_accs=strict_accs,
|
| 534 |
+
relaxed_table=relaxed_table,
|
| 535 |
+
strict_table=strict_table,
|
| 536 |
+
)
|
| 537 |
+
|
| 538 |
+
# 8) μ μΆ μ±κ³΅ κΈ°λ‘ λ° λ¦¬λ보λ μ μ₯
|
| 539 |
+
if self.enable_limit and tracker:
|
| 540 |
+
progress(0.85, desc="μ μΆ λ΄μ μ μ₯ μ€...")
|
| 541 |
+
save_ok = tracker.record_submission(
|
| 542 |
+
name,
|
| 543 |
+
os.path.basename(file.name),
|
| 544 |
+
success=True,
|
| 545 |
+
submit_model=normalized_model,
|
| 546 |
+
submit_description=normalized_description,
|
| 547 |
+
)
|
| 548 |
+
progress(0.9, desc="리λ보λ μ
λ°μ΄νΈ μ€...")
|
| 549 |
+
self._save_leaderboard(name, normalized_model, normalized_description, relaxed_accs, strict_accs)
|
| 550 |
+
else:
|
| 551 |
+
self._save_leaderboard(name, normalized_model, normalized_description, relaxed_accs, strict_accs)
|
| 552 |
+
|
| 553 |
+
# 9) κ²°κ³Ό λ¬Έμμ΄ κ΅¬μ±
|
| 554 |
+
progress(1.0, desc="μλ£")
|
| 555 |
+
return result_summary
|
| 556 |
+
|
| 557 |
+
|
| 558 |
+
# -------------------------
|
| 559 |
+
# λͺ¨λ-λ 벨 μνΈλ¦¬ν¬μΈνΈ (κΈ°μ‘΄ UI νΈν)
|
| 560 |
+
# -------------------------
|
| 561 |
+
def process_submission(
|
| 562 |
+
file,
|
| 563 |
+
name: str,
|
| 564 |
+
submit_model: str,
|
| 565 |
+
submit_description: str,
|
| 566 |
+
progress: gr.Progress = gr.Progress()
|
| 567 |
+
) -> str:
|
| 568 |
+
"""
|
| 569 |
+
Gradioμμ μ§μ νΈμΆνλ μνΈλ¦¬ν¬μΈνΈ.
|
| 570 |
+
λ΄λΆμ μΌλ‘ DIλ₯Ό μ μ©ν SubmissionHandlerλ₯Ό μμ±ν΄ νΈμΆνλ€.
|
| 571 |
+
"""
|
| 572 |
+
tracker = get_submission_tracker() if Config.ENABLE_SUBMISSION_LIMIT else None
|
| 573 |
+
handler = SubmissionHandler(tracker=tracker, cfg=Config)
|
| 574 |
+
try:
|
| 575 |
+
return handler.process_submission(
|
| 576 |
+
file=file,
|
| 577 |
+
name=name,
|
| 578 |
+
submit_model=submit_model,
|
| 579 |
+
submit_description=submit_description,
|
| 580 |
+
progress=progress,
|
| 581 |
+
)
|
| 582 |
+
except Exception as e:
|
| 583 |
+
# μ΅μμ 보νΈλ§: μμμΉ λͺ»ν μμΈλ μ¬μ©μ μΉνμ μΌλ‘ λ°ν
|
| 584 |
+
try:
|
| 585 |
+
tracking_user_id = None
|
| 586 |
+
if handler.enable_limit and handler.tracker:
|
| 587 |
+
# λκ° μ μΆνλμ§λ trackerκ° μκ³ μλ€λ©΄ κΈ°λ‘
|
| 588 |
+
try:
|
| 589 |
+
tracking_user_id = handler.tracker.get_user_id()
|
| 590 |
+
except Exception:
|
| 591 |
+
tracking_user_id = None
|
| 592 |
+
if handler.enable_limit and handler.tracker:
|
| 593 |
+
handler.tracker.record_submission(
|
| 594 |
+
name=name,
|
| 595 |
+
file_name=os.path.basename(file.name) if file else "(unknown)",
|
| 596 |
+
success=False,
|
| 597 |
+
error_message=str(e),
|
| 598 |
+
submit_model=(submit_model or "").strip() or "Anonymous Model",
|
| 599 |
+
submit_description=(submit_description or "").strip() or None,
|
| 600 |
+
)
|
| 601 |
+
except Exception:
|
| 602 |
+
# κΈ°λ‘ μ€ν¨λ μ‘°μ©ν 무μ
|
| 603 |
+
pass
|
| 604 |
+
|
| 605 |
+
total_time = 0.0 # μλ¨μμ μΈ‘μ νμ§ λͺ»νμ μ μμΌλ―λ‘ 0μΌλ‘
|
| 606 |
+
error_message = str(e)
|
| 607 |
+
|
| 608 |
+
return (
|
| 609 |
+
"β νκ° μ€ν¨\n\n"
|
| 610 |
+
"μ€λ₯ λ΄μ©:\n"
|
| 611 |
+
f"{error_message}\n\n"
|
| 612 |
+
f"μμ μκ°: {total_time:.2f}μ΄ ({total_time/60:.2f}λΆ)\n\n"
|
| 613 |
+
"μ μΆμ μ μμ μΌλ‘ μ²λ¦¬λμμ§λ§, νκ° κ³Όμ μμ μ€λ₯κ° λ°μνμ΅λλ€.\n"
|
| 614 |
+
"μ μΆ κΈ°λ‘μ μ μ₯λμμ΅λλ€."
|
| 615 |
+
)
|
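The progress handling in `_evaluate_freshqa` above is easier to see in isolation: each worker pushes a unit increment onto a shared queue per finished item, and the caller drains the queue to report done/total. The sketch below is a minimal stand-alone version under those assumptions; `work()` is only a stand-in for `evaluate_dataframe_parallel`, not the real evaluator.

```python
# Self-contained sketch of the queue-based progress pattern used above:
# two workers push 1 onto a shared queue per finished item, the main
# thread drains the queue and prints done/total.
import queue
import time
from concurrent.futures import ThreadPoolExecutor

def work(n_items: int, progress_queue: "queue.Queue[int]") -> str:
    for _ in range(n_items):
        time.sleep(0.01)           # simulate evaluating one row
        progress_queue.put(1)      # unit increment, as in the real evaluator
    return "done"

q: "queue.Queue[int]" = queue.Queue()
n = 20
total = n * 2                      # two modes (Relaxed/Strict) -> twice the items
done = 0

with ThreadPoolExecutor(max_workers=2) as ex:
    futures = [ex.submit(work, n, q), ex.submit(work, n, q)]
    while not all(f.done() for f in futures) or not q.empty():
        try:
            done += q.get(timeout=0.05)
            print(f"progress {done}/{total}")
        except queue.Empty:
            pass
```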
src/submission_tracker.py
ADDED
|
@@ -0,0 +1,304 @@
| 1 |
+
"""
|
| 2 |
+
μ¬μ©μ μ μΆ μΆμ λͺ¨λ
|
| 3 |
+
HuggingFace μ¬μ©μ IDλ₯Ό κΈ°λ°μΌλ‘ ν루 3λ² μ ν κΈ°λ₯μ μ 곡ν©λλ€.
|
| 4 |
+
μ μΆ μ 보λ λ³λμ HuggingFace repositoryμμ κ΄λ¦¬λ©λλ€.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
import os
|
| 8 |
+
import json
|
| 9 |
+
import pandas as pd
|
| 10 |
+
import tempfile
|
| 11 |
+
from datetime import datetime, date
|
| 12 |
+
from typing import Dict, List, Optional, Tuple
|
| 13 |
+
from huggingface_hub import whoami, hf_hub_download, login, HfApi
|
| 14 |
+
import pytz
|
| 15 |
+
from src.utils import file_lock, get_current_date_str, get_current_datetime_str
|
| 16 |
+
|
| 17 |
+
# νκ΅ μκ°λ μ€μ
|
| 18 |
+
KOREA_TZ = pytz.timezone('Asia/Seoul')
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
class SubmissionTracker:
|
| 22 |
+
"""μ¬μ©μ μ μΆ μΆμ ν΄λμ€ - HuggingFace Repository κΈ°λ°"""
|
| 23 |
+
|
| 24 |
+
def __init__(self,
|
| 25 |
+
repo_id: Optional[str] = None,
|
| 26 |
+
token: Optional[str] = None,
|
| 27 |
+
filename: str = "user_submissions.json"):
|
| 28 |
+
"""
|
| 29 |
+
Args:
|
| 30 |
+
repo_id: HuggingFace repository ID (μ: "username/submission-tracker")
|
| 31 |
+
token: HuggingFace API ν ν° (Noneμ΄λ©΄ νκ²½λ³μμμ μλ λ‘λ)
|
| 32 |
+
filename: μ μΆ κΈ°λ‘ νμΌλͺ
|
| 33 |
+
"""
|
| 34 |
+
# νκ²½λ³μμμ μ€μ κ°μ Έμ€κΈ°
|
| 35 |
+
self.repo_id = repo_id or os.getenv('SUBMISSION_TRACKER_REPO_ID')
|
| 36 |
+
self.token = token or os.getenv('HF_TOKEN') or os.getenv('HUGGINGFACE_HUB_TOKEN')
|
| 37 |
+
self.filename = filename
|
| 38 |
+
|
| 39 |
+
if not self.repo_id:
|
| 40 |
+
raise ValueError(
|
| 41 |
+
"SUBMISSION_TRACKER_REPO_ID νκ²½λ³μκ° μ€μ λμ§ μμμ΅λλ€. "
|
| 42 |
+
"λλ repo_idλ₯Ό μ§μ μ λ¬ν΄μ£ΌμΈμ."
|
| 43 |
+
)
|
| 44 |
+
|
| 45 |
+
if not self.token:
|
| 46 |
+
raise ValueError(
|
| 47 |
+
"HuggingFace ν ν°μ΄ νμν©λλ€. "
|
| 48 |
+
"ν ν°μ μ§μ μ λ¬νκ±°λ HF_TOKEN νκ²½λ³μλ₯Ό μ€μ νμΈμ."
|
| 49 |
+
)
|
| 50 |
+
|
| 51 |
+
# HuggingFace API μ΄κΈ°ν
|
| 52 |
+
self.api = HfApi()
|
| 53 |
+
try:
|
| 54 |
+
login(token=self.token)
|
| 55 |
+
# β
HuggingFaceμ μ±κ³΅μ μΌλ‘ λ‘κ·ΈμΈλμμ΅λλ€.
|
| 56 |
+
except Exception as e:
|
| 57 |
+
print(f"β HuggingFace λ‘κ·ΈμΈ μ€ν¨: {e}")
|
| 58 |
+
raise
|
| 59 |
+
|
| 60 |
+
# μ μΆ κΈ°λ‘ λ‘λ
|
| 61 |
+
self.submissions = self.load_submissions()
|
| 62 |
+
|
| 63 |
+
def load_submissions(self) -> Dict:
|
| 64 |
+
"""HuggingFace repositoryμμ μ μΆ κΈ°λ‘ λ‘λ"""
|
| 65 |
+
try:
|
| 66 |
+
# π₯ HuggingFace repositoryμμ μ μΆ κΈ°λ‘ λ‘λ μ€: {self.repo_id}/{self.filename}
|
| 67 |
+
|
| 68 |
+
# μμ λλ ν 리μ νμΌ λ€μ΄λ‘λ
|
| 69 |
+
with tempfile.TemporaryDirectory() as temp_dir:
|
| 70 |
+
file_path = hf_hub_download(
|
| 71 |
+
repo_id=self.repo_id,
|
| 72 |
+
filename=self.filename,
|
| 73 |
+
local_dir=temp_dir,
|
| 74 |
+
repo_type="dataset",
|
| 75 |
+
token=self.token
|
| 76 |
+
)
|
| 77 |
+
|
| 78 |
+
# JSON νμΌ λ‘λ
|
| 79 |
+
with open(file_path, 'r', encoding='utf-8') as f:
|
| 80 |
+
submissions = json.load(f)
|
| 81 |
+
|
| 82 |
+
# β
μ μΆ κΈ°λ‘ λ‘λ μλ£: {len(submissions)}λͺ
μ μ¬μ©μ κΈ°λ‘
|
| 83 |
+
return submissions
|
| 84 |
+
|
| 85 |
+
except Exception as e:
|
| 86 |
+
print(f"β οΈ μ μΆ κΈ°λ‘ λ‘λ μ€ν¨ (μλ‘ μμ): {e}")
|
| 87 |
+
return {}
|
| 88 |
+
|
| 89 |
+
def get_user_id(self) -> Optional[str]:
|
| 90 |
+
"""HuggingFaceμμ νμ¬ μ¬μ©μ ID κ°μ Έμ€κΈ° (κ³ μ ID μ¬μ©)"""
|
| 91 |
+
try:
|
| 92 |
+
user_info = whoami()
|
| 93 |
+
# κ³ μ ID μ¬μ© (λ³κ²½ λΆκ°λ₯ν μλ³μ)
|
| 94 |
+
return user_info.get("id", None)
|
| 95 |
+
except Exception as e:
|
| 96 |
+
print(f"β οΈ μ¬μ©μ ID κ°μ Έμ€κΈ° μ€ν¨: {e}")
|
| 97 |
+
raise Exception("β μ¬μ©μ IDλ₯Ό κ°μ Έμ¬ μ μμ΅λλ€. HuggingFaceμ λ‘κ·ΈμΈλμ΄ μλμ§ νμΈν΄μ£ΌμΈμ.")
|
| 98 |
+
|
| 99 |
+
def get_today_submissions(self, user_id: str) -> List[Dict]:
|
| 100 |
+
"""μ€λ μ¬μ©μμ μ μΆ κΈ°λ‘ κ°μ Έμ€κΈ°"""
|
| 101 |
+
today = get_current_date_str()
|
| 102 |
+
user_submissions = self.submissions.get(user_id, {})
|
| 103 |
+
return user_submissions.get(today, [])
|
| 104 |
+
|
| 105 |
+
def can_submit(self, submissions_data: Optional[Dict] = None) -> Tuple[bool, str, int]:
|
| 106 |
+
"""μ¬μ©μκ° μ μΆν μ μλμ§ νμΈ"""
|
| 107 |
+
user_id = self.get_user_id()
|
| 108 |
+
data = submissions_data if submissions_data is not None else self.submissions
|
| 109 |
+
today = get_current_date_str()
|
| 110 |
+
today_submissions = data.get(user_id, {}).get(today, [])
|
| 111 |
+
successful_count = len([s for s in today_submissions if s.get('success', False)])
|
| 112 |
+
|
| 113 |
+
if successful_count >= 3:
|
| 114 |
+
raise Exception("β μ€λ μ μΆ νλλ₯Ό μ΄κ³Όνμ΅λλ€. λ΄μΌ λ€μ μλν΄μ£ΌμΈμ.")
|
| 115 |
+
|
| 116 |
+
remaining = 3 - successful_count
|
| 117 |
+
return True, f"β
μ μΆ κ°λ₯ν©λλ€. (μ€λ {successful_count}/3ν μ¬μ©, {remaining}ν λ¨μ)", remaining
|
| 118 |
+
|
| 119 |
+
def record_submission(
|
| 120 |
+
self,
|
| 121 |
+
submitter_name: str,
|
| 122 |
+
file_name: str,
|
| 123 |
+
success: bool,
|
| 124 |
+
error_message: str = None,
|
| 125 |
+
submit_model: Optional[str] = None,
|
| 126 |
+
submit_description: Optional[str] = None
|
| 127 |
+
) -> bool:
|
| 128 |
+
"""μ μΆ κΈ°λ‘ μΆκ° (νμΌ μ κΈμΌλ‘ 보νΈ)"""
|
| 129 |
+
user_id = self.get_user_id()
|
| 130 |
+
|
| 131 |
+
# μ κΈ νμΌ κ²½λ‘ μμ±
|
| 132 |
+
lock_file_path = tempfile.gettempdir() + f'/{self.repo_id.replace("/", "_")}.lock'
|
| 133 |
+
|
| 134 |
+
# νμΌ μ κΈμΌλ‘ μ 체 κ³Όμ μ atomicνκ² λ³΄νΈ
|
| 135 |
+
with file_lock(lock_file_path):
|
| 136 |
+
try:
|
| 137 |
+
# μ΅μ λ°μ΄ν°λ₯Ό λ€μ λ‘λ (λ€λ₯Έ νλ‘μΈμ€μμ μ
λ°μ΄νΈνμ μ μμ)
|
| 138 |
+
latest_submissions = self.load_submissions()
|
| 139 |
+
|
| 140 |
+
# Lock λ΄λΆμμ μ΅μ λ°μ΄ν° κΈ°μ€μΌλ‘ μ μΆ κ°λ₯ μ¬λΆ μ¬νμΈ
|
| 141 |
+
try:
|
| 142 |
+
can_submit, message, _ = self.can_submit(submissions_data=latest_submissions)
|
| 143 |
+
except Exception as e:
|
| 144 |
+
# μ μΆ μ ν μ΄κ³Ό μ
|
| 145 |
+
# μ μΆ μ ν μ΄κ³Ό λ©μμ§: {e}
|
| 146 |
+
# λ©λͺ¨λ¦¬λ§ μ
λ°μ΄νΈνκ³ μ μ₯νμ§ μμ
|
| 147 |
+
self.submissions = latest_submissions
|
| 148 |
+
return False
|
| 149 |
+
|
| 150 |
+
# μλ‘μ΄ μ μΆ κΈ°λ‘ μΆκ°
|
| 151 |
+
current_datetime = get_current_datetime_str()
|
| 152 |
+
|
| 153 |
+
if user_id not in latest_submissions:
|
| 154 |
+
latest_submissions[user_id] = {}
|
| 155 |
+
|
| 156 |
+
today = get_current_date_str()
|
| 157 |
+
if today not in latest_submissions[user_id]:
|
| 158 |
+
latest_submissions[user_id][today] = []
|
| 159 |
+
|
| 160 |
+
submission_record = {
|
| 161 |
+
"timestamp": current_datetime,
|
| 162 |
+
"submitter_name": submitter_name,
|
| 163 |
+
"file_name": file_name,
|
| 164 |
+
"success": success,
|
| 165 |
+
"error_message": error_message,
|
| 166 |
+
"submit_model": submit_model,
|
| 167 |
+
"submit_description": submit_description
|
| 168 |
+
}
|
| 169 |
+
|
| 170 |
+
latest_submissions[user_id][today].append(submission_record)
|
| 171 |
+
|
| 172 |
+
# λ©λͺ¨λ¦¬ μ
λ°μ΄νΈ
|
| 173 |
+
self.submissions = latest_submissions
|
| 174 |
+
|
| 175 |
+
# μ μ₯
|
| 176 |
+
return self._save_submissions_internal(latest_submissions)
|
| 177 |
+
|
| 178 |
+
except Exception as e:
|
| 179 |
+
print(f"β μ μΆ κΈ°λ‘ μΆκ° μ€ν¨: {e}")
|
| 180 |
+
return False
|
| 181 |
+
|
| 182 |
+
def _save_submissions_internal(self, submissions_data: Dict) -> bool:
|
| 183 |
+
"""λ΄λΆ μ μ₯ ν¨μ (lockμ μ΄λ―Έ νλλ μν)"""
|
| 184 |
+
try:
|
| 185 |
+
# πΎ HuggingFace repositoryμ μ μΆ κΈ°λ‘ μ μ₯ μ€: {self.repo_id}/{self.filename}
|
| 186 |
+
|
| 187 |
+
# μμ νμΌμ JSON λ°μ΄ν° μ μ₯
|
| 188 |
+
with tempfile.NamedTemporaryFile(mode='w', encoding='utf-8', suffix='.json', delete=False) as temp_file:
|
| 189 |
+
json.dump(submissions_data, temp_file, ensure_ascii=False, indent=2)
|
| 190 |
+
temp_file_path = temp_file.name
|
| 191 |
+
|
| 192 |
+
# HuggingFace repositoryμ νμΌ μ
λ‘λ
|
| 193 |
+
self.api.upload_file(
|
| 194 |
+
path_or_fileobj=temp_file_path,
|
| 195 |
+
path_in_repo=self.filename,
|
| 196 |
+
repo_id=self.repo_id,
|
| 197 |
+
repo_type="dataset",
|
| 198 |
+
token=self.token,
|
| 199 |
+
commit_message=f"Update submission records - {datetime.now(KOREA_TZ).strftime('%Y-%m-%d %H:%M:%S')}"
|
| 200 |
+
)
|
| 201 |
+
|
| 202 |
+
# μμ νμΌ μμ
|
| 203 |
+
os.unlink(temp_file_path)
|
| 204 |
+
|
| 205 |
+
# β
μ μΆ κΈ°λ‘ μ μ₯ μλ£
|
| 206 |
+
return True
|
| 207 |
+
|
| 208 |
+
except Exception as e:
|
| 209 |
+
print(f"β μ μΆ κΈ°λ‘ μ μ₯ μ€ν¨: {e}")
|
| 210 |
+
return False
|
| 211 |
+
|
| 212 |
+
def get_user_submission_history(self, user_id: str, days: int = 7) -> Dict:
|
| 213 |
+
"""μ¬μ©μμ μ΅κ·Ό μ μΆ κΈ°λ‘ κ°μ Έμ€κΈ°"""
|
| 214 |
+
if not user_id or user_id not in self.submissions:
|
| 215 |
+
return {}
|
| 216 |
+
|
| 217 |
+
user_submissions = self.submissions[user_id]
|
| 218 |
+
today = datetime.now(KOREA_TZ).date()
|
| 219 |
+
|
| 220 |
+
history = {}
|
| 221 |
+
for i in range(days):
|
| 222 |
+
check_date = today - pd.Timedelta(days=i)
|
| 223 |
+
date_str = check_date.strftime('%Y-%m-%d')
|
| 224 |
+
|
| 225 |
+
if date_str in user_submissions:
|
| 226 |
+
history[date_str] = user_submissions[date_str]
|
| 227 |
+
|
| 228 |
+
return history
|
| 229 |
+
|
| 230 |
+
def get_submission_stats(self, user_id: str) -> Dict:
|
| 231 |
+
"""μ¬μ©μ μ μΆ ν΅κ³ κ°μ Έμ€κΈ°"""
|
| 232 |
+
if not user_id:
|
| 233 |
+
return {}
|
| 234 |
+
|
| 235 |
+
today_submissions = self.get_today_submissions(user_id)
|
| 236 |
+
successful_today_count = len([s for s in today_submissions if s.get('success', False)])
|
| 237 |
+
history = self.get_user_submission_history(user_id, 7)
|
| 238 |
+
|
| 239 |
+
# ν΅κ³ κ³μ°
|
| 240 |
+
total_submissions = sum(len(day_submissions) for day_submissions in history.values())
|
| 241 |
+
successful_submissions = sum(
|
| 242 |
+
len([s for s in day_submissions if s.get('success', False)])
|
| 243 |
+
for day_submissions in history.values()
|
| 244 |
+
)
|
| 245 |
+
failed_submissions = total_submissions - successful_submissions
|
| 246 |
+
|
| 247 |
+
return {
|
| 248 |
+
"today_count": len(today_submissions),
|
| 249 |
+
"today_remaining": max(0, 3 - successful_today_count),
|
| 250 |
+
"week_total": total_submissions,
|
| 251 |
+
"week_successful": successful_submissions,
|
| 252 |
+
"week_failed": failed_submissions,
|
| 253 |
+
"history": history
|
| 254 |
+
}
|
| 255 |
+
|
| 256 |
+
def cleanup_old_records(self, days_to_keep: int = 30):
|
| 257 |
+
"""μ€λλ μ μΆ κΈ°λ‘ μ 리 (νμΌ μ κΈ μ¬μ©)"""
|
| 258 |
+
# μ κΈ νμΌ κ²½λ‘ μμ±
|
| 259 |
+
lock_file_path = tempfile.gettempdir() + f'/{self.repo_id.replace("/", "_")}.lock'
|
| 260 |
+
|
| 261 |
+
# νμΌ μ κΈμΌλ‘ μ 체 κ³Όμ μ atomicνκ² λ³΄νΈ
|
| 262 |
+
with file_lock(lock_file_path):
|
| 263 |
+
try:
|
| 264 |
+
# μ΅μ λ°μ΄ν°λ₯Ό λ€μ λ‘λ
|
| 265 |
+
latest_submissions = self.load_submissions()
|
| 266 |
+
|
| 267 |
+
cutoff_date = datetime.now(KOREA_TZ) - pd.Timedelta(days=days_to_keep)
|
| 268 |
+
cutoff_str = cutoff_date.strftime('%Y-%m-%d')
|
| 269 |
+
|
| 270 |
+
cleaned_count = 0
|
| 271 |
+
for user_id in list(latest_submissions.keys()):
|
| 272 |
+
user_submissions = latest_submissions[user_id]
|
| 273 |
+
for date_str in list(user_submissions.keys()):
|
| 274 |
+
if date_str < cutoff_str:
|
| 275 |
+
del user_submissions[date_str]
|
| 276 |
+
cleaned_count += 1
|
| 277 |
+
|
| 278 |
+
# λΉ μ¬μ©μ κΈ°λ‘ μ κ±°
|
| 279 |
+
if not user_submissions:
|
| 280 |
+
del latest_submissions[user_id]
|
| 281 |
+
|
| 282 |
+
# λ©λͺ¨λ¦¬ μ
λ°μ΄νΈ
|
| 283 |
+
self.submissions = latest_submissions
|
| 284 |
+
|
| 285 |
+
if cleaned_count > 0:
|
| 286 |
+
if self._save_submissions_internal(latest_submissions):
|
| 287 |
+
print(f"π§Ή {cleaned_count}κ°μ μ€λλ μ μΆ κΈ°λ‘μ μ 리νμ΅λλ€.")
|
| 288 |
+
else:
|
| 289 |
+
print(f"β οΈ {cleaned_count}κ°μ μ€λλ μ μΆ κΈ°λ‘μ μ 리νμ§λ§ μ μ₯μ μ€ν¨νμ΅λλ€.")
|
| 290 |
+
|
| 291 |
+
return cleaned_count
|
| 292 |
+
|
| 293 |
+
except Exception as e:
|
| 294 |
+
print(f"β μ€λλ κΈ°λ‘ μ 리 μ€ν¨: {e}")
|
| 295 |
+
return 0
|
| 296 |
+
|
| 297 |
+
|
| 298 |
+
def get_submission_tracker() -> Optional[SubmissionTracker]:
|
| 299 |
+
"""SubmissionTracker μΈμ€ν΄μ€ λ°ν"""
|
| 300 |
+
try:
|
| 301 |
+
return SubmissionTracker()
|
| 302 |
+
except Exception as e:
|
| 303 |
+
print(f"β SubmissionTracker μ΄κΈ°ν μ€ν¨: {e}")
|
| 304 |
+
return None
|
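The daily limit in `can_submit` reduces to counting only the successful records for today in the `{user_id: {"YYYY-MM-DD": [record, ...]}}` layout the tracker stores. A stand-alone sketch of that bookkeeping (sample data made up, no Hugging Face calls):

```python
# Sketch of the per-day quota check: failed submissions do not consume quota,
# and the limit of 3 successful submissions per day mirrors the tracker above.
submissions = {
    "user-123": {
        "2025-01-15": [
            {"success": True, "file_name": "run1.csv"},
            {"success": False, "file_name": "run2.csv"},
            {"success": True, "file_name": "run3.csv"},
        ]
    }
}

def successful_today(data: dict, user_id: str, today: str) -> int:
    """Count only successful submissions for the given user and date."""
    records = data.get(user_id, {}).get(today, [])
    return sum(1 for r in records if r.get("success", False))

used = successful_today(submissions, "user-123", "2025-01-15")
print(f"used {used}/3, remaining {max(0, 3 - used)}")
```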
src/utils.py
ADDED
|
@@ -0,0 +1,58 @@
"""
Utility function module.
Collects the helper functions shared across the project.
"""

import os
import fcntl
import pytz
from contextlib import contextmanager
from datetime import datetime

# Korea Standard Time
KOREA_TZ = pytz.timezone('Asia/Seoul')

def get_korea_datetime_now():
    """Return the current time in the Korea time zone."""
    return datetime.now(KOREA_TZ)

def get_current_datetime_str(dt=None):
    """Format a Korea-time datetime as a string."""
    if dt is None:
        dt = get_korea_datetime_now()
    return dt.strftime('%Y-%m-%d %H:%M:%S')

def get_current_date_str():
    """Return today's date in Korea time."""
    return get_korea_datetime_now().strftime("%Y-%m-%d")

@contextmanager
def file_lock(lock_file_path):
    """
    Context manager that provides a file-based exclusive lock.

    Args:
        lock_file_path: path of the lock file

    Yields:
        None (used only as a context manager)

    Examples:
        >>> with file_lock('/tmp/test.lock'):
        ...     # do work while the lock is held
        ...     pass
    """
    # Create the lock file if it does not exist
    if not os.path.exists(lock_file_path):
        open(lock_file_path, 'w').close()

    # Open the lock file and acquire an exclusive lock
    with open(lock_file_path, 'r') as lock_file:
        try:
            # Block until the exclusive lock is acquired (other processes wait)
            fcntl.flock(lock_file.fileno(), fcntl.LOCK_EX)
            # Lock acquired; run the protected block
            yield
        finally:
            # Release the lock
            fcntl.flock(lock_file.fileno(), fcntl.LOCK_UN)
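`record_submission` in the tracker combines `file_lock` with a reload-inside-the-lock step so concurrent writers cannot interleave. A minimal sketch of that read-modify-write pattern, assuming a local JSON file (the paths here are made up; the lock is `fcntl`-based, so POSIX-only):

```python
# Sketch: acquire the lock, re-load the latest state, mutate it, persist it,
# all before releasing the lock, so concurrent submissions cannot interleave.
import json
import os

from src.utils import file_lock

STATE_PATH = "/tmp/demo_state.json"   # illustrative path
LOCK_PATH = "/tmp/demo_state.lock"    # illustrative path

def add_record(record: dict) -> None:
    with file_lock(LOCK_PATH):
        # Load the latest state only after the lock is held.
        if os.path.exists(STATE_PATH):
            with open(STATE_PATH, "r", encoding="utf-8") as f:
                state = json.load(f)
        else:
            state = []
        state.append(record)
        # Persist while still holding the lock.
        with open(STATE_PATH, "w", encoding="utf-8") as f:
            json.dump(state, f, ensure_ascii=False, indent=2)

add_record({"file_name": "run1.csv", "success": True})
```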
ui/dataset_tab.py
ADDED
|
@@ -0,0 +1,142 @@
| 1 |
+
"""
|
| 2 |
+
λ°μ΄ν°μ
λ€μ΄λ‘λ ν UI μ»΄ν¬λνΈ
|
| 3 |
+
|
| 4 |
+
πΎ λ°μ΄ν°μ
λ€μ΄λ‘λ νμ UIμ λ‘μ§μ κ΄λ¦¬ν©λλ€.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
import gradio as gr
|
| 8 |
+
import pandas as pd
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
def create_dataset_tab():
|
| 12 |
+
"""λ°μ΄ν°μ
λ€μ΄λ‘λ ν UI μμ±"""
|
| 13 |
+
|
| 14 |
+
# λ°μ΄ν°μ
미리보기 λ‘λ (μ΄κΈ°ν μ ν λ²λ§)
|
| 15 |
+
try:
|
| 16 |
+
dev_preview_data = pd.read_csv("data/public/ko-freshqa_2025_dev.csv").head(5)
|
| 17 |
+
test_preview_data = pd.read_csv("data/public/ko-freshqa_2025_test.csv").head(5)
|
| 18 |
+
except Exception as e:
|
| 19 |
+
print(f"β οΈ λ°μ΄ν°μ
미리보기 λ‘λ μ€ν¨: {e}")
|
| 20 |
+
dev_preview_data = pd.DataFrame()
|
| 21 |
+
test_preview_data = pd.DataFrame()
|
| 22 |
+
|
| 23 |
+
gr.Markdown("""
|
| 24 |
+
### Ko-FreshQA Dataset
|
| 25 |
+
|
| 26 |
+
- μ΄ λ°μ΄ν°μ
λ° λ¦¬λ보λλ [FreshQA](https://github.com/freshllms/freshqa)μμ μκ°μ λ°μ λ§λ€μ΄μ‘μ΅λλ€.
|
| 27 |
+
- fact type(fast changing, slow changing, never changing), μ μ μ μ ν¨μ±, 10κ°μ λλ©μΈμ λ°λΌ λλλ μ§λ¬Έλ€μ ν΅ν΄ νκ΅μ΄ μ§μκ³Ό κ΄λ ¨λ LLMμ μ΅μ μ±μ νλ¨ν μ μμ΅λλ€.
|
| 28 |
+
- κ²μ¦ λ° νκ°μ νμν λ°μ΄ν°μ
μ μ£ΌκΈ°μ μΌλ‘ μ
λ°μ΄νΈν μμ μ
λλ€.
|
| 29 |
+
|
| 30 |
+
<br>
|
| 31 |
+
|
| 32 |
+
### Ko-FreshQA λ°μ΄ν°μ
μ μλμ κ°μ νΉμ§μ κ°μ§κ³ μμ΅λλ€.
|
| 33 |
+
- **fact type**
|
| 34 |
+
- μκ°μ νλ¦μ λ°λ₯Έ λ΅λ³μ λ³λ κ°λ₯μ±μ λ°λΌ μ§λ¬Έμ μλμ μΈ κ°μ§λ‘ λΆλ₯λ©λλ€.
|
| 35 |
+
- **fast changing** : 닡변이 보톡 1년 λ˜λŠ” κ·Έ 이내에 λ³€ν•˜λŠ” 질문
|
| 36 |
+
- **slow changing** : 닡변이 λͺ‡ 년에 걸쳐 λ³€ν•˜λŠ” 질문
|
| 37 |
+
- **never changing** : 역사적 사건, 진싀과 같이 닡변이 거의 λ³€ν•˜μ§€ μ•ŠλŠ” 질문
|
| 38 |
+
|
| 39 |
+
- **μ μ μ ν¨μ±**
|
| 40 |
+
- **false premise (T/F)** : μ§λ¬Έμ ν¬ν¨λ μ μ μμ²΄κ° μλͺ»λμ΄ μμΌλ©΄ True, μ μ μ λ¬Έμ κ° μμΌλ©΄ False
|
| 41 |
+
|
| 42 |
+
- **one/multi hop**
|
| 43 |
+
- λ΅λ³μ μμ±νκΈ° μν΄ νμν μΆλ‘ μ κ°μμ λ°λΌ μ§λ¬Έμ one hop, multi hopμΌλ‘ λΆλ₯ν©λλ€.
|
| 44 |
+
|
| 45 |
+
- **λλ©μΈ**
|
| 46 |
+
- λͺ¨λ μ§λ¬Έκ³Ό λλ΅μ λ€μ λλ©μΈ μ€ νλλ‘ λΆλ₯λ©λλ€.
|
| 47 |
+
- μ μΉ, μ€ν¬μΈ , μ°μ, λ μ¨, μΈκ³, κ²½μ , μ¬ν, IT/κ³Όν, μν/λ¬Έν, UNK
|
| 48 |
+
|
| 49 |
+
- **λλ¨Έμ§ λ©ν μ 보**
|
| 50 |
+
- **effective year** : μ§λ¬Έμ λ΅λ³μ΄ λ§μ§λ§μΌλ‘ λ³κ²½λ μ°λ
|
| 51 |
+
- **next review** : μμλλ λ€μ κ²ν λ μ§
|
| 52 |
+
- **source** : μ§λ¬Έ/λ΅λ³μ λν μ 보λ₯Ό μ°Ύμ μ μλ μΆμ²
|
| 53 |
+
|
| 54 |
+
<br>
|
| 55 |
+
""")
|
| 56 |
+
|
| 57 |
+
|
| 58 |
+
with gr.Column(elem_classes=["leaderboard-group"]):
|
| 59 |
+
with gr.Row():
|
| 60 |
+
with gr.Column():
|
| 61 |
+
gr.Markdown("### π§ͺ DEV λ°μ΄ν°μ
(κ°λ°/κ²μ¦μ©)")
|
| 62 |
+
gr.Markdown("""
|
| 63 |
+
**Dev set**: 550μ
|
| 64 |
+
- λͺ¨λΈ κ°λ° λ° κ²μ¦μ μν΄ μ¬μ©ν μ μμ΅λλ€.
|
| 65 |
+
- μ λ΅μ λΉλ‘―νμ¬ λͺ¨λ λ©νλ°μ΄ν°κ° μ 곡λ©λλ€.
|
| 66 |
+
""")
|
| 67 |
+
|
| 68 |
+
# DEV λ°μ΄ν°μ
λ€μ΄λ‘λ λ²νΌ
|
| 69 |
+
dev_download_btn = gr.DownloadButton(
|
| 70 |
+
"πΎ DEV λ°μ΄ν°μ
λ€μ΄λ‘λ",
|
| 71 |
+
value="data/public/ko-freshqa_2025_dev.csv",
|
| 72 |
+
variant="primary",
|
| 73 |
+
size="lg"
|
| 74 |
+
)
|
| 75 |
+
|
| 76 |
+
# DEV λ°μ΄ν°μ
미리보기
|
| 77 |
+
dev_preview = gr.DataFrame(
|
| 78 |
+
value=lambda: pd.read_csv("data/public/ko-freshqa_2025_dev.csv").head(5),
|
| 79 |
+
interactive=False,
|
| 80 |
+
label=""
|
| 81 |
+
)
|
| 82 |
+
|
| 83 |
+
with gr.Column():
|
| 84 |
+
gr.Markdown("### π― TEST λ°μ΄ν°μ
(μ΅μ’
νκ°μ©)")
|
| 85 |
+
gr.Markdown("""
|
| 86 |
+
**Test set**: 3,000κ°
|
| 87 |
+
- 리λ보λ μ μΆμ μν νκ°μ© λ°μ΄ν°μ
μ
λλ€.
|
| 88 |
+
- model_responseλ₯Ό μ±μμ μ μΆν΄μ£ΌμΈμ.
|
| 89 |
+
""")
|
| 90 |
+
|
| 91 |
+
# TEST λ°μ΄ν°μ
λ€μ΄λ‘λ λ²νΌ
|
| 92 |
+
test_download_btn = gr.DownloadButton(
|
| 93 |
+
"πΎ TEST λ°μ΄ν°μ
λ€μ΄λ‘λ",
|
| 94 |
+
value="data/public/ko-freshqa_2025_test.csv",
|
| 95 |
+
variant="primary",
|
| 96 |
+
size="lg"
|
| 97 |
+
)
|
| 98 |
+
|
| 99 |
+
# TEST λ°μ΄ν°μ
미리보기
|
| 100 |
+
test_preview = gr.DataFrame(
|
| 101 |
+
value=lambda: pd.read_csv("data/public/ko-freshqa_2025_test.csv").head(5),
|
| 102 |
+
interactive=False,
|
| 103 |
+
label=""
|
| 104 |
+
)
|
| 105 |
+
|
| 106 |
+
# λ€μ΄λ‘λ μλ΄ λ©μμ§
|
| 107 |
+
gr.Markdown("""
|
| 108 |
+
<br>
|
| 109 |
+
|
| 110 |
+
### π‘ λ€μ΄λ‘λ μλ΄
|
| 111 |
+
|
| 112 |
+
- μμ λ€μ΄λ‘λ λ²νΌμ ν΄λ¦νλ©΄ λΈλΌμ°μ μμ μλμΌλ‘ νμΌ λ€μ΄λ‘λκ° μμλ©λλ€.
|
| 113 |
+
- **DEV λ°μ΄ν°μ
**μ λͺ¨λΈ κ°λ° λ° κ²μ¦οΏ½οΏ½οΏ½μΌλ‘ μ¬μ©νμΈμ.
|
| 114 |
+
- **TEST λ°μ΄ν°μ
**μ μ΅μ’
νκ° λ° λ¦¬λ보λ μ μΆμ©μΌλ‘ μ¬μ©νμΈμ.
|
| 115 |
+
- λ€μ΄λ‘λλ νμΌμ **CSV νμ**, **UTF-8 μΈμ½λ©**μΌλ‘ μ μ₯λ©λλ€.
|
| 116 |
+
|
| 117 |
+
<br>
|
| 118 |
+
""")
|
| 119 |
+
|
| 120 |
+
# License & References
|
| 121 |
+
gr.Markdown("""
|
| 122 |
+
### π License & References
|
| 123 |
+
|
| 124 |
+
- λ³Έ λ°μ΄ν°μ
μ **CC-BY-ND-NC (μ μμνμ Β· λ³κ²½ κΈμ§ Β· λΉμ리)** λΌμ΄μ μ€λ‘ μ 곡λ©λλ€.
|
| 125 |
+
- μ΄ λ¦¬λ보λλ IITPμ **βμμ±ν μΈμ΄λͺ¨λΈμ μ§μκ°λ₯μ±κ³Ό μκ°μ νλ¦μ λ°λ₯Έ μ΅μ μ± λ°μμ μν νμ΅ λ° νμ© κΈ°μ κ°λ°β** μ¬μ
μ μ§μμ λ°μ μ μλμμ΅λλ€.
|
| 126 |
+
- μ΄ μμ€ν
μ FreshLLMs νλ‘μ νΈμ **FreshQA λ°μ΄ν°μ
κ³Ό νκ° λ°©λ²λ‘ **μ κΈ°λ°μΌλ‘ ꡬμΆλμμ΅λλ€.
|
| 127 |
+
- μλ³Έ FreshQAλ λ§ν¬λ₯Ό μ°Έκ³ ν΄ μ£ΌμΈμ. π https://github.com/freshllms/freshqa
|
| 128 |
+
""")
|
| 129 |
+
|
| 130 |
+
|
| 131 |
+
gr.Markdown("""
|
| 132 |
+
```
|
| 133 |
+
@misc{vu2023freshllms,
|
| 134 |
+
title={FreshLLMs: Refreshing Large Language Models with Search Engine Augmentation},
|
| 135 |
+
author={Tu Vu and Mohit Iyyer and Xuezhi Wang and Noah Constant and Jerry Wei and Jason Wei and Chris Tar and Yun-Hsuan Sung and Denny Zhou and Quoc Le and Thang Luong},
|
| 136 |
+
year={2023},
|
| 137 |
+
eprint={2310.03214},
|
| 138 |
+
archivePrefix={arXiv},
|
| 139 |
+
primaryClass={cs.CL}
|
| 140 |
+
}
|
| 141 |
+
```
|
| 142 |
+
""")
|
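Based on the instructions in this tab, a plausible workflow for the TEST set is: load the downloaded CSV, fill `model_response` for every row, and save the file for the submission tab. This is only a sketch; `generate_answer` is a placeholder for your own model call, and it assumes the test CSV exposes a `question` column, as the submission-time validation requires.

```python
# Sketch: fill model_response for every test question and save a UTF-8 CSV.
import pandas as pd

def generate_answer(question: str) -> str:
    # Replace with a real model call.
    return "model answer for: " + question

test_df = pd.read_csv("data/public/ko-freshqa_2025_test.csv")
test_df["model_response"] = test_df["question"].apply(generate_answer)
test_df.to_csv("my_submission.csv", index=False, encoding="utf-8")
```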
ui/leaderboard_tab.py
ADDED
|
@@ -0,0 +1,229 @@
| 1 |
+
"""
|
| 2 |
+
리λ보λ ν UI μ»΄ν¬λνΈ
|
| 3 |
+
|
| 4 |
+
π Leaderboard νμ UIμ λ‘μ§μ κ΄λ¦¬ν©λλ€.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
import gradio as gr
|
| 8 |
+
import pandas as pd
|
| 9 |
+
from src.leaderboard_manager import load_leaderboard_data, prepare_display_data
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
def create_leaderboard_tab():
|
| 13 |
+
"""리λ보λ ν UI μμ±"""
|
| 14 |
+
|
| 15 |
+
# μ΅μλ¨ ν΅ν© κ²μ λ° - κ°μ λ λμμΈ
|
| 16 |
+
with gr.Row():
|
| 17 |
+
with gr.Column(scale=12):
|
| 18 |
+
search_input = gr.Textbox(
|
| 19 |
+
label="μ μΆμ μ΄λ¦ κ²μ",
|
| 20 |
+
placeholder="π μ μΆμ μ΄λ¦μΌλ‘ κ²μ...",
|
| 21 |
+
value="",
|
| 22 |
+
container=False,
|
| 23 |
+
elem_classes=["search-input"]
|
| 24 |
+
)
|
| 25 |
+
with gr.Column(scale=1, min_width=100):
|
| 26 |
+
clear_search_btn = gr.Button(
|
| 27 |
+
"ποΈ μ΄κΈ°ν",
|
| 28 |
+
variant="secondary",
|
| 29 |
+
size="sm",
|
| 30 |
+
elem_classes=["clear-search-btn"]
|
| 31 |
+
)
|
| 32 |
+
with gr.Column(scale=1, min_width=100):
|
| 33 |
+
refresh_btn = gr.Button(
|
| 34 |
+
"π μλ‘κ³ μΉ¨",
|
| 35 |
+
variant="primary",
|
| 36 |
+
size="sm",
|
| 37 |
+
elem_classes=["refresh-btn"]
|
| 38 |
+
)
|
| 39 |
+
|
| 40 |
+
# 리λ보λ λ
ΈμΆ μ»¬λΌ λ° νμλͺ
μ€μ
|
| 41 |
+
DISPLAY_COLUMNS = [
|
| 42 |
+
'rank',
|
| 43 |
+
'id',
|
| 44 |
+
'model',
|
| 45 |
+
'description',
|
| 46 |
+
'accuracy',
|
| 47 |
+
'fast_changing_accuracy',
|
| 48 |
+
'slow_changing_accuracy',
|
| 49 |
+
'never_changing_accuracy',
|
| 50 |
+
'acc_vp',
|
| 51 |
+
'acc_fp',
|
| 52 |
+
'acc_vp_one_hop',
|
| 53 |
+
'acc_vp_two_hop',
|
| 54 |
+
'acc_fp_one_hop',
|
| 55 |
+
'acc_fp_two_hop',
|
| 56 |
+
'acc_politics',
|
| 57 |
+
'acc_sports',
|
| 58 |
+
'acc_entertainment',
|
| 59 |
+
'acc_weather',
|
| 60 |
+
'acc_world',
|
| 61 |
+
'acc_economy',
|
| 62 |
+
'acc_society',
|
| 63 |
+
'acc_it_science',
|
| 64 |
+
'acc_life_culture',
|
| 65 |
+
'acc_unknown'
|
| 66 |
+
]
|
| 67 |
+
|
| 68 |
+
COLUMN_LABELS = {
|
| 69 |
+
'rank': 'Rank',
|
| 70 |
+
'id': 'ID',
|
| 71 |
+
'model': 'Model',
|
| 72 |
+
'description': 'Description',
|
| 73 |
+
'accuracy': 'Accuracy',
|
| 74 |
+
'fast_changing_accuracy': 'Fast-changing',
|
| 75 |
+
'slow_changing_accuracy': 'Slow-changing',
|
| 76 |
+
'never_changing_accuracy': 'Never-changing',
|
| 77 |
+
'acc_vp': 'Valid Premise',
|
| 78 |
+
'acc_fp': 'False Premise',
|
| 79 |
+
'acc_vp_one_hop': 'VP One-hop',
|
| 80 |
+
'acc_vp_two_hop': 'VP Multi-hop',
|
| 81 |
+
'acc_fp_one_hop': 'FP One-hop',
|
| 82 |
+
'acc_fp_two_hop': 'FP Multi-hop',
|
| 83 |
+
'acc_politics': 'Politics',
|
| 84 |
+
'acc_sports': 'Sports',
|
| 85 |
+
'acc_entertainment': 'Entertainment',
|
| 86 |
+
'acc_weather': 'Weather',
|
| 87 |
+
'acc_world': 'World',
|
| 88 |
+
'acc_economy': 'Economy',
|
| 89 |
+
'acc_society': 'Society',
|
| 90 |
+
'acc_it_science': 'IT/Science',
|
| 91 |
+
'acc_life_culture': 'Life/Culture',
|
| 92 |
+
'acc_unknown': 'Unknown'
|
| 93 |
+
}
|
| 94 |
+
|
| 95 |
+
def format_leaderboard(df: pd.DataFrame) -> pd.DataFrame:
|
| 96 |
+
"""리λ보λμ λ
ΈμΆν μ»¬λΌ μ ν λ° ν€λλͺ
λ³ν"""
|
| 97 |
+
if df.empty:
|
| 98 |
+
# λΉ DataFrameμΌ λλ μ»¬λΌ κ΅¬μ‘°λ₯Ό μ μ§νκΈ° μν΄ λΉ DataFrame μμ±
|
| 99 |
+
empty_df = pd.DataFrame(columns=DISPLAY_COLUMNS)
|
| 100 |
+
rename_map = {col: COLUMN_LABELS[col] for col in DISPLAY_COLUMNS if col in COLUMN_LABELS}
|
| 101 |
+
            return empty_df.rename(columns=rename_map)

        selected_columns = [col for col in DISPLAY_COLUMNS if col in df.columns]
        formatted_df = df[selected_columns].copy()
        rename_map = {col: COLUMN_LABELS[col] for col in selected_columns if col in COLUMN_LABELS}
        return formatted_df.rename(columns=rename_map)

    def build_leaderboard_state(source_df: pd.DataFrame):
        """Return formatted Relaxed/Strict data for display, plus whether both are empty."""
        if source_df is None:
            source_df = pd.DataFrame()

        if source_df.empty or 'evaluation_mode' not in source_df.columns:
            relaxed_df = pd.DataFrame()
            strict_df = pd.DataFrame()
        else:
            relaxed_df = source_df.query("evaluation_mode == 'Relaxed'")
            strict_df = source_df.query("evaluation_mode == 'Strict'")

        formatted_relaxed = format_leaderboard(prepare_display_data(relaxed_df))
        formatted_strict = format_leaderboard(prepare_display_data(strict_df))
        is_empty = relaxed_df.empty and strict_df.empty
        return formatted_relaxed, formatted_strict, is_empty

    leaderboard_data = load_leaderboard_data()
    relaxed_initial, strict_initial, is_initial_empty = build_leaderboard_state(leaderboard_data)

    # Relaxed mode leaderboard
    with gr.Column(elem_classes=["leaderboard-group"]):
        gr.Markdown(
            "### 🟢 Relaxed Evaluation"
        )

        relaxed_leaderboard_table = gr.DataFrame(
            value=relaxed_initial,
            interactive=False,
            wrap=False,
            show_label=False,
            elem_classes=["leaderboard-table"]
        )

    # Strict mode leaderboard
    with gr.Column(elem_classes=["leaderboard-group"]):
        gr.Markdown(
            "### 🔴 Strict Evaluation"
        )

        strict_leaderboard_table = gr.DataFrame(
            value=strict_initial,
            interactive=False,
            wrap=False,
            show_label=False,
            elem_classes=["leaderboard-table"]
        )

    # Notes about the leaderboard
    with gr.Column(elem_classes=["leaderboard-group"]):
        gr.Markdown("""
        This leaderboard is inspired by [FreshQA](https://github.com/freshllms/freshqa).
        Its questions are split by fact type (fast changing, slow changing, never changing), truthfulness of the premise,
        and ten domains, so you can gauge how up to date an LLM is on Korean knowledge.

        The leaderboard was built with support from the IITP project **"Development of training and utilization technologies for the sustainability of generative language models and for reflecting recency over time."**

        To preserve the integrity and validity of the results and to **prevent tampering**, the evaluation dataset is maintained against its gold answers.
        """)

    # Unified search filter (filters both the Relaxed and Strict leaderboards)
    def filter_leaderboard_data(search_text):
        """Filter the leaderboard data for both Relaxed and Strict modes (CSV-based)."""
        try:
            # Load the full data from the CSV
            all_df = load_leaderboard_data()

            # Apply the search filter (matches submitter info only)
            if search_text.strip() and 'id' in all_df.columns:
                mask = all_df['id'].str.contains(search_text, case=False, na=False)
                filtered_df = all_df[mask]
            else:
                filtered_df = all_df

            formatted_relaxed, formatted_strict, _ = build_leaderboard_state(filtered_df)
            return formatted_relaxed, formatted_strict
        except Exception as e:
            print(f"❌ Failed to filter leaderboard data: {e}")
            empty = pd.DataFrame()
            return empty, empty

    # Wire up the search event
    search_input.change(
        fn=filter_leaderboard_data,
        inputs=[search_input],
        outputs=[relaxed_leaderboard_table, strict_leaderboard_table]
    )

    # Clear-search button
    def clear_search():
        try:
            all_df = load_leaderboard_data()
            formatted_relaxed, formatted_strict, _ = build_leaderboard_state(all_df)
            return "", formatted_relaxed, formatted_strict
        except Exception as e:
            print(f"❌ Failed to load leaderboard data: {e}")
            empty = pd.DataFrame()
            return "", empty, empty

    clear_search_btn.click(
        fn=clear_search,
        outputs=[search_input, relaxed_leaderboard_table, strict_leaderboard_table]
    )

    # Refresh button
    def refresh_leaderboard():
        try:
            all_df = load_leaderboard_data()
            formatted_relaxed, formatted_strict, is_empty = build_leaderboard_state(all_df)

            return formatted_relaxed, formatted_strict
        except Exception as e:
            print(f"❌ Failed to refresh the leaderboard: {e}")
            empty = pd.DataFrame()
            return empty, empty

    refresh_btn.click(
        fn=refresh_leaderboard,
        outputs=[relaxed_leaderboard_table, strict_leaderboard_table]
    )
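For orientation, here is a minimal sketch (not part of the commit) of the kind of frame `build_leaderboard_state` expects from `load_leaderboard_data()`. Only the `id` and `evaluation_mode` columns are used directly above; the score column and values below are assumptions for illustration, since the real column set comes from `DISPLAY_COLUMNS`/`COLUMN_LABELS` defined earlier in the file.

```python
# Hypothetical toy leaderboard frame; real columns are defined elsewhere in the repo.
import pandas as pd

toy = pd.DataFrame({
    "id": ["team-a", "team-a", "team-b", "team-b"],
    "evaluation_mode": ["Relaxed", "Strict", "Relaxed", "Strict"],
    "accuracy": [0.71, 0.54, 0.65, 0.49],  # assumed score column
})

# The same split the tab performs before formatting each table.
relaxed = toy.query("evaluation_mode == 'Relaxed'")
strict = toy.query("evaluation_mode == 'Strict'")
print(relaxed)
print(strict)
```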
ui/styles.css
ADDED
@@ -0,0 +1,136 @@
/* Always show the vertical scrollbar so the page width does not shift */
html {
    overflow-y: scroll;
}

/* ================================
   Base container styles (unified across all tabs)
   ================================ */
.gradio-container,
.main {
    max-width: 1400px !important; /* cap the width so the page never gets too wide */
    width: 100% !important;
    margin: 0 auto !important; /* always centered */
}

.fixed-list * {
    font-size: 15px !important;
}


/* ================================
   Base dataframe styles
   ================================ */
.dataframe {
    font-size: 16px !important;
    width: 100% !important;
}

.dataframe table {
    font-size: 16px !important;
    width: 100% !important;
    table-layout: auto !important;
}

.dataframe th {
    font-size: 18px !important;
    font-weight: bold !important;
    padding: 12px !important;
    white-space: nowrap !important;
}

.dataframe td {
    font-size: 16px !important;
    padding: 10px !important;
    white-space: nowrap !important;
}


/* ================================
   Leaderboard search and styles
   ================================ */
.search-input input {
    font-size: 16px !important;
    padding: 12px 16px !important;
    border-radius: 8px !important;
    border: 2px solid #e0e0e0 !important;
    transition: border-color 0.3s ease !important;

    /* 🔹 fixed input height */
    height: 40px !important;
    box-sizing: border-box !important;
}

.search-input input:focus {
    border-color: #4a90e2 !important;
    outline: none !important;
    box-shadow: 0 0 0 3px rgba(74, 144, 226, 0.1) !important;
}

/* Search box wrapper */
.search-input {
    margin: 8px 0 12px 0 !important;
    display: block;
}

.search-input input {
    margin: 0 !important;
}

/* ================================
   Button styles (keep the default look)
   ================================ */
.clear-search-btn,
.refresh-btn {
    border-radius: 8px !important;
    font-weight: 500 !important;

    /* 🔹 match the height of the search bar */
    height: 40px !important;
    padding: 0 16px !important;

    /* keep the existing style as much as possible */
    margin-top: 4px !important;
}



/* ================================
   Leaderboard group/table spacing and card styles
   ================================ */
.leaderboard-group {
    margin: 18px 0 28px 0 !important;
    padding: 12px 14px !important;
    border: 1px solid #eee;
    border-radius: 12px;
    background: #ffffff;
}

.leaderboard-table {
    margin-top: 8px !important;
}

/* Extra cell padding (leaderboard only) */
.leaderboard-table .dataframe th {
    padding: 12px 14px !important;
}
.leaderboard-table .dataframe td {
    padding: 10px 14px !important;
}

/* ================================
   Submission status (Textbox): scrolling and line-wrapping fixes
   ================================ */
.submission-status textarea {
    max-height: 420px !important;
    overflow-y: auto !important;
    white-space: pre-wrap !important; /* preserve line breaks */
    word-break: break-word !important; /* keep long words/rows from overflowing */
    text-align: left !important; /* force left alignment */
}

/* Avoid nested scrolling: let the wrapper grow instead of scrolling */
.submission-status, .submission-status .wrap {
    max-height: none !important;
    overflow: visible !important;
}
ui/submission_tab.py
ADDED
@@ -0,0 +1,98 @@
"""
Submission and evaluation tab UI component

🤗 Manages the UI and logic of the submission & evaluation tab.
"""

import gradio as gr
from src.submission_handler import process_submission


def create_submission_tab():
    """Build the submission & evaluation tab UI"""

    gr.Markdown("""
    ### 📋 How to submit
    - Download the test set from the Dataset tab
    - Generate a model_response for every question
    - Upload the CSV file with model_response filled in (UTF-8 encoding)
    - Fill in the submitter name (id), the model used, and a description, then submit
    - Description: if you applied a separate method, please describe it. If left blank, the score is treated as the base performance of the model used.
    - Model used: you may use any model to generate the responses. Please write the model's **official name**.

    <br>

    ### 📊 How evaluation works
    - Evaluation is run with Upstage's latest **Solar model**. *(As of 2025-11-11: solar-pro2-250909)*
    - Results include not only overall accuracy but also breakdowns by fact type, premise validity, number of hops, and domain.
    - A single submission is scored with both **relaxed evaluation** and **strict evaluation**.

    ##### 🔹 relaxed evaluation
    - Only the correctness of the main information in the answer is assessed.
    - Even if the answer contains hallucinations or outdated information, it can still count as correct as long as the main information is unaffected.
    - Answers in the wrong format (e.g., answering in a different language) are also accepted.

    ##### 🔹 strict evaluation
    - Beyond the main information being correct, every fact must be accurate and up to date.
    - An answer is not counted as correct if it contains even a minor hallucination.
    - Outdated-knowledge disclaimers such as "My knowledge only goes up to September 2021..." are accepted only when it is clear that the information has not changed since then.

    <br>

    ### 🚫 Submission limits
    - Each user may submit **up to 3 times per day**.
    - Failed submissions do not count.
    - The submission count resets **every day at 00:00 Korea time**.

    <br>

    ### ⏱️ Expected evaluation time
    - Evaluation is expected to take **about 30 minutes per submission**.
    - It may take longer when many participants submit at the same time.

    <br>
    """)



    submission_file = gr.File(
        label="Upload the CSV file containing your answers",
        file_types=['.csv']
    )
    submitter_name = gr.Textbox(
        label="Submitter name",
        placeholder="e.g., AI Ambassador",
        value="Anonymous"
    )
    submit_model = gr.Textbox(
        label="Model used",
        placeholder="Please write the official name of the model you used.",
        value="Anonymous Model"
    )
    submit_description = gr.Textbox(
        label="Description",
        placeholder="If you applied a separate method, please describe it."
    )

    # Submit button
    submit_btn = gr.Button(
        "🚀 Submit and start evaluation",
        variant="primary"
    )

    # Submission status and result text
    submission_status = gr.Textbox(
        label="Submission status",
        value="Upload a CSV file and submit.",
        interactive=False,
        lines=20,
        elem_classes=["submission-status"]
    )

    # Wire up the submit button event
    submit_btn.click(
        fn=process_submission,
        inputs=[submission_file, submitter_name, submit_model, submit_description],
        outputs=[submission_status],
        concurrency_limit=3
    )
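As a rough sketch of the submission flow described in the tab above (not part of the commit): the test-set path matches `data/public/ko-freshqa_2025_test.csv` from this repository, but the `question` column name and the placeholder answering function below are assumptions for illustration.

```python
# Hypothetical preparation of a submission CSV: fill in model_response for each
# question and save with UTF-8 encoding, as the instructions above require.
import pandas as pd

test_df = pd.read_csv("data/public/ko-freshqa_2025_test.csv")

def generate_answer(question: str) -> str:
    # Replace this stub with a call to whichever model you are evaluating.
    return f"model answer for: {question}"

test_df["model_response"] = test_df["question"].astype(str).map(generate_answer)
test_df.to_csv("my_submission.csv", index=False, encoding="utf-8")
```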