jisubae committed on
Commit
5e8f045
·
0 Parent(s):

initial commit

Browse files
.gitattributes ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,114 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Python
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+ *.so
6
+ .Python
7
+ build/
8
+ develop-eggs/
9
+ dist/
10
+ downloads/
11
+ eggs/
12
+ .eggs/
13
+ lib/
14
+ lib64/
15
+ parts/
16
+ sdist/
17
+ var/
18
+ wheels/
19
+ *.egg-info/
20
+ .installed.cfg
21
+ *.egg
22
+ MANIFEST
23
+
24
+ # Virtual environments
25
+ .env
26
+ .venv
27
+ env/
28
+ venv/
29
+ ENV/
30
+ env.bak/
31
+ venv.bak/
32
+
33
+ # IDE
34
+ .vscode/
35
+ .idea/
36
+ *.swp
37
+ *.swo
38
+ *~
39
+
40
+ # OS
41
+ .DS_Store
42
+ .DS_Store?
43
+ ._*
44
+ .Spotlight-V100
45
+ .Trashes
46
+ ehthumbs.db
47
+ Thumbs.db
48
+
49
+ # Project specific - Test and temporary files (but keep leaderboard_results.csv)
50
+ # leaderboard_results.csv # 주석 μ²˜λ¦¬ν•˜μ—¬ Git 좔적 ν—ˆμš©
51
+ evaluation_results.json
52
+ test_model_responses.csv
53
+ *.log
54
+
55
+ # Private data (λ―Όκ°ν•œ 데이터)
56
+ #data/private/
57
+
58
+ # Jupyter Notebook
59
+ .ipynb_checkpoints
60
+
61
+ # pyenv
62
+ .python-version
63
+
64
+ # pipenv
65
+ Pipfile.lock
66
+
67
+ # pytest
68
+ .pytest_cache/
69
+ .coverage
70
+ htmlcov/
71
+
72
+ # mypy
73
+ .mypy_cache/
74
+ .dmypy.json
75
+ dmypy.json
76
+
77
+ # Gradio temporary files
78
+ gradio_cached_examples/
79
+ flagged/
80
+
81
+ # Lock files for file locking mechanism
82
+ *.lock
83
+
84
+ # MacOS specific
85
+ .AppleDouble
86
+ .LSOverride
87
+ Icon
88
+
89
+ # Thumbnails
90
+ ._*
91
+
92
+ # Files that might appear in the root of a volume
93
+ .DocumentRevisions-V100
94
+ .fseventsd
95
+ .TemporaryItems
96
+ .VolumeIcon.icns
97
+ .com.apple.timemachine.donotpresent
98
+
99
+ # Directories potentially created on remote AFP share
100
+ .AppleDB
101
+ .AppleDesktop
102
+ Network Trash Folder
103
+ Temporary Items
104
+ .apdisk
105
+
106
+ # Setup artifacts and temporary files
107
+ =*.*
108
+ fix_setup.sh
109
+ install_dependencies.sh
110
+ step_by_step.sh
111
+ simple_test.py
112
+
113
+ # Data
114
+ freshqa/freshqa_prompt.py
Dockerfile ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Python 3.10 matches the version listed in the README requirements and is
# needed by the gradio>=5 dependency declared in environment.yml.
FROM python:3.10-slim

WORKDIR /app

# Install system packages (tzdata is needed for the timezone setup below)
RUN apt-get update && apt-get install -y \
    git \
    curl \
    tzdata \
    && rm -rf /var/lib/apt/lists/*

# Set the container timezone to Asia/Seoul
ENV TZ=Asia/Seoul
RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone

# Install Python dependencies
COPY requirements.txt .
RUN pip install --no-cache-dir --upgrade pip
RUN pip install --no-cache-dir -r requirements.txt

# Copy the application files
COPY . .

# Make the helper script executable
RUN chmod +x quick_start.sh

# Runtime environment variables
ENV PYTHONPATH=/app
ENV GRADIO_SERVER_NAME=0.0.0.0
ENV GRADIO_SERVER_PORT=7860

# Expose the Gradio port
EXPOSE 7860

# Default command
CMD ["python", "app.py"]
README.md ADDED
@@ -0,0 +1,153 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Ko-FreshQA Leaderboard
3
+ emoji: 🚀
4
+ colorFrom: blue
5
+ colorTo: green
6
+ sdk: gradio
7
+ app_file: app.py
8
+ pinned: false
9
+ license: apache-2.0
10
+ ---
11
+
12
+ ## Ko-FreshQA Leaderboard
13
+ ν•œκ΅­μ–΄ FreshQA 기반 μžλ™ 평가/λ¦¬λ”λ³΄λ“œ μ‹œμŠ€ν…œμž…λ‹ˆλ‹€. μ°Έκ°€μžκ°€ μ—…λ‘œλ“œν•œ CSV의 `model_response`λ₯Ό κΈ°μ€€ 데이터와 λ§€μΉ­ν•˜κ³ , Upstage Solar λͺ¨λΈλ‘œ Relaxed/Strict 평가λ₯Ό μˆ˜ν–‰ν•œ λ’€ κ²°κ³Όλ₯Ό λ¦¬λ”λ³΄λ“œμ— λ°˜μ˜ν•©λ‹ˆλ‹€. Gradio UI둜 μ‹€ν–‰λ©λ‹ˆλ‹€.
14
+
15
+ ### 핡심 κΈ°λŠ₯
16
+ - 데이터셋 배포: DEV/TEST CSV λ‹€μš΄λ‘œλ“œ νƒ­ 제곡
17
+ - 제좜 및 μžλ™ 평가: μ—…λ‘œλ“œλœ CSVλ₯Ό 병합 β†’ 평가 β†’ μ§€ν‘œ 집계 β†’ λ¦¬λ”λ³΄λ“œ 반영
18
+ - 상세 μ§€ν‘œ: fact type, μ „μ œ μœ νš¨μ„±(vp/fp), hop(one/multi), 연도(old/new), 도메인별 정확도
19
+ - 제좜 μ œν•œ(μ˜΅μ…˜): μ‚¬μš©μžλ³„ ν•˜λ£¨ 3회 μ œν•œ κΈ°λŠ₯ (Hugging Face μ €μž₯μ†Œ 기반)
20
+
21
+ ---
22
+
23
+ ## 디렉터리 ꡬ쑰
24
+ - `app.py`: Gradio μ•± μ΄ˆκΈ°ν™” 및 νƒ­ ꡬ성
25
+ - `config.py`: ν™˜κ²½λ³€μˆ˜ λ‘œλ“œ 및 ν•„μˆ˜ μ„€μ • 검증
26
+ - `freshqa/`
27
+ - `fresheval.py`: 단일 μƒ˜ν”Œ 평가 둜직
28
+ - `fresheval_parallel.py`: λ°μ΄ν„°ν”„λ ˆμž„ 병렬 평가 래퍼
29
+ - `freshqa_acc.py`: 평가 κ²°κ³Ό 집계(정확도 계산 및 도메인별 톡계)
30
+ - `merge_csv_with_model_response.py`: κΈ°μ€€ 데이터와 μ‚¬μš©μž CSV 병합
31
+ - `src/`
32
+ - `submission_handler.py`: μ œμΆœλΆ€ν„° λ¦¬λ”λ³΄λ“œ λ°˜μ˜κΉŒμ§€ 전체 μ˜€μΌ€μŠ€νŠΈλ ˆμ΄μ…˜
33
+ - `submission_tracker.py`: 제좜 이λ ₯ 좔적(HF repo 기반, μ˜΅μ…˜)
34
+ - `leaderboard_manager.py`: λ¦¬λ”λ³΄λ“œ CSV λ‘œλ“œ/μ €μž₯/ν‘œμ‹œμš© 정리
35
+ - `quick_csv_loader.py`, `hf_private_csv_loader.py`: HF Private repoμ—μ„œ CSV λ‘œλ“œ μœ ν‹Έ
36
+ - `api_key_rotator.py`, `utils.py`: μœ ν‹Έλ¦¬ν‹°
37
+ - `ui/`
38
+ - `leaderboard_tab.py`, `submission_tab.py`, `dataset_tab.py`, `styles.css`
39
+ - `data/leaderboard_results.csv`: λ¦¬λ”λ³΄λ“œ λˆ„μ  데이터
40
+
41
+ ---
42
+
43
+ ## μš”κ΅¬ 사항
44
+ - Python 3.10
45
+ - Upstage API ν‚€(단일 λ˜λŠ” 닀쀑)
46
+ - Hugging Face 토큰(HF Private repo μ ‘κ·Όμš©)
47
+ - Hugging Face Dataset repo
48
+ - κΈ°μ€€ 데이터: `FRESHQA_DATA_REPO_ID` / `FRESHQA_DATA_FILENAME`
49
+ - (μ˜΅μ…˜) 제좜 좔적 μ €μž₯μ†Œ: `SUBMISSION_TRACKER_REPO_ID`
50
+
51
+ μ„€μΉ˜:
52
+ ```bash
53
+ python -m venv venv && source venv/bin/activate
54
+ pip install -r requirements.txt
55
+ ```
56
+
57
+ λ˜λŠ” Conda:
58
+ ```bash
59
+ conda env create -f environment.yml
60
+ conda activate freshqa-leaderboard
61
+ ```
62
+
63
+ ---
64
+
65
+ ## ν™˜κ²½ λ³€μˆ˜(.env)
66
+ `env.example`λ₯Ό `.env`둜 볡사 ν›„ κ°’ μ±„μš°κΈ°:
67
+ ```bash
68
+ cp env.example .env
69
+ ```
70
+
71
+ ν•„μˆ˜/μ£Όμš” λ³€μˆ˜
72
+ - HF_TOKEN
73
+ - FRESHQA_DATA_REPO_ID
74
+ - FRESHQA_DATA_FILENAME (κΈ°λ³Έκ°’: ko-freshqa_2025_total.csv)
75
+ - UPSTAGE_API_KEY λ˜λŠ” UPSTAGE_API_KEYS(콀마 ꡬ뢄)
76
+ - ENABLE_SUBMISSION_LIMIT (κΈ°λ³Έ: true)
77
+ - SUBMISSION_TRACKER_REPO_ID (제좜 μ œν•œ μ‚¬μš© μ‹œ ν•„μš”)
78
+
79
+ 검증: μ•± μ‹œμž‘ μ‹œ `Config.validate_required_configs()`κ°€ λˆ„λ½λœ ν•„μˆ˜ 섀정을 κ²€μ‚¬ν•©λ‹ˆλ‹€.
80
+
81
+ ---
82
+
83
+ ## μ‹€ν–‰
84
+ 둜컬:
85
+ ```bash
86
+ python app.py
87
+ ```
88
+ 기본 포트: 7860
89
+
90
+ Hugging Face Spaces:
91
+ - ν™˜κ²½λ³€μˆ˜ `SPACE_ID`κ°€ μ‘΄μž¬ν•˜λ©΄ Spaces λͺ¨λ“œλ‘œ λ™μž‘ν•©λ‹ˆλ‹€.
92
+
93
+ Docker(μ˜΅μ…˜):
94
+ - `Dockerfile`, `docker-compose.yml` 제곡 (ν•„μš” μ‹œ 섀정에 맞게 μˆ˜μ •)
95
+
96
+ ---
97
+
98
+ ## μ‚¬μš© 방법(Gradio UI)
99
+ 1) 데이터셋 νƒ­
100
+ - DEV/TEST CSV λ‹€μš΄λ‘œλ“œ
101
+
102
+ 2) 제좜 및 평가 νƒ­
103
+ - μ—…λ‘œλ“œ: TEST CSV에 `model_response`κ°€ μ±„μ›Œμ§„ 파일
104
+ - μž…λ ₯: 제좜자 이름, μ‚¬μš© λͺ¨λΈ, μ„€λͺ…
105
+ - 평가: Upstage Solar λͺ¨λΈλ‘œ Relaxed/Strict λ™μ‹œ μˆ˜ν–‰
106
+ - 좜λ ₯: 전체/μ„ΈλΆ€ μ§€ν‘œκ°€ κ³„μ‚°λ˜μ–΄ λ¦¬λ”λ³΄λ“œμ— 반영
107
+
108
+ 3) λ¦¬λ”λ³΄λ“œ νƒ­
109
+ - 제좜 κ²°κ³Όκ°€ `data/leaderboard_results.csv`에 λˆ„μ 
110
+ - 검색/μƒˆλ‘œκ³ μΉ¨ κ°€λŠ₯
111
+
112
+ ---
113
+
114
+ ## λ™μž‘ 흐름(λ‚΄λΆ€)
115
+ 1) 제좜 μ ‘μˆ˜: `src/submission_handler.py::process_submission`
116
+ 2) μ‚¬μš©μž CSV λ‘œλ“œ β†’ κΈ°μ€€ 데이터와 병합:
117
+ - `freshqa/merge_csv_with_model_response.py::merge_dataframe_with_model_response_df`
118
+ 3) 평가:
119
+ - `freshqa/fresheval_parallel.py::evaluate_dataframe` β†’ `freshqa/fresheval.py::FreshEval`
120
+ 4) 정확도 집계:
121
+ - `freshqa/freshqa_acc.py::calculate_accuracy`, `process_freshqa_dataframe`
122
+ 5) μ €μž₯:
123
+ - λ¦¬λ”λ³΄λ“œ: `src/leaderboard_manager.py::append_to_leaderboard_data`
124
+ - (μ˜΅μ…˜) 제좜 이λ ₯: `src/submission_tracker.py` (ENABLE_SUBMISSION_LIMIT=true 일 λ•Œλ§Œ)
125
+
126
+ 주의: `ENABLE_SUBMISSION_LIMIT=false`인 경우, 제좜 이λ ₯ μΆ”μ μš© Hugging Face μ €μž₯μ†Œ 접근을 μ‹œλ„ν•˜μ§€ μ•Šλ„λ‘ μ½”λ“œκ°€ λ°˜μ˜λ˜μ–΄ μžˆμŠ΅λ‹ˆλ‹€.
127
+
128
+ ---
129
+
130
+ ## 제좜 μ œν•œ(μ˜΅μ…˜)
131
+ - μ„€μ •: `ENABLE_SUBMISSION_LIMIT=true`(κΈ°λ³Έ)
132
+ - μ €μž₯μ†Œ: `SUBMISSION_TRACKER_REPO_ID`에 `user_submissions.json` 관리
133
+ - 둜직:
134
+ - ν•œ μ‚¬μš©μž ν•˜λ£¨ 3회 성곡 μ œμΆœκΉŒμ§€ 카운트
135
+ - ν•œκ΅­ μ‹œκ°„ κΈ°μ€€ 00:00에 일자 λ‹¨μœ„λ‘œ 카운트
136
+ - λΉ„ν™œμ„±ν™” μ‹œ(HF μ €μž₯μ†Œ μ ‘κ·Ό μ—†μŒ): `SubmissionHandler`κ°€ 좔적기λ₯Ό μƒμ„±ν•˜μ§€ μ•ŠμŒ
137
+
138
+ ---
139
+
140
+ ## νŠΈλŸ¬λΈ”μŠˆνŒ…
141
+ - μ‹œμž‘ μ‹œ β€œν•„μˆ˜ μ„€μ • λˆ„λ½β€ 였λ₯˜
142
+ - `.env`μ—μ„œ `UPSTAGE_API_KEY(or KEYS)`, `HF_TOKEN`, `FRESHQA_DATA_REPO_ID` 확인
143
+ - 제좜 μ œν•œ λΉ„ν™œμ„±ν™”μΈλ° HF 404 κ²½κ³ κ°€ λ³΄μž„
144
+ - ν˜„ 버전은 `ENABLE_SUBMISSION_LIMIT=false`일 λ•Œ 제좜 좔적기λ₯Ό μ΄ˆκΈ°ν™”ν•˜μ§€ μ•Šλ„λ‘ μˆ˜μ •λ¨
145
+ - HF 404 (제좜 μ œν•œ ν™œμ„±ν™”)
146
+ - `SUBMISSION_TRACKER_REPO_ID` μ €μž₯μ†Œμ— `user_submissions.json`이 μ—†μœΌλ©΄ 졜초 μ ‘κ·Ό μ‹œ 404κ°€ λ‚  수 μžˆμŠ΅λ‹ˆλ‹€. νŒŒμΌμ„ 빈 JSON `{}`으둜 생성해 λ‘μ„Έμš”.
147
+
148
+ ---
149
+
150
+ ## λΌμ΄μ„ μŠ€/좜처
151
+ - λ³Έ λ¦¬λ”λ³΄λ“œλŠ” FreshQAμ—μ„œ μ˜κ°μ„ λ°›μ•„ μ œμž‘λ˜μ—ˆμŠ΅λ‹ˆλ‹€.
152
+
153
+ 문의 사항은 이슈둜 등둝해 μ£Όμ„Έμš”.
app.py ADDED
@@ -0,0 +1,96 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Ko-FreshQA Leaderboard 메인 μ• ν”Œλ¦¬μΌ€μ΄μ…˜
3
+
4
+ Gradio 기반의 μ›Ή μΈν„°νŽ˜μ΄μŠ€λ₯Ό μ œκ³΅ν•©λ‹ˆλ‹€.
5
+ """
6
+
7
+ import os
8
+ import gradio as gr
9
+ from config import Config
10
+ from ui.leaderboard_tab import create_leaderboard_tab
11
+ from ui.submission_tab import create_submission_tab
12
+ from ui.dataset_tab import create_dataset_tab
13
+
14
+
15
def load_css():
    """Read and return the contents of ``ui/styles.css`` next to this file.

    Returns:
        The stylesheet text, decoded as UTF-8.

    Raises:
        FileNotFoundError: if the stylesheet does not exist. A warning that
            names the missing path is printed before the error is raised.
    """
    current_dir = os.path.dirname(os.path.abspath(__file__))
    css_path = os.path.join(current_dir, 'ui', 'styles.css')

    try:
        with open(css_path, 'r', encoding='utf-8') as f:
            return f.read()
    except FileNotFoundError:
        # The f-prefix matters: without it the literal text "{css_path}" was
        # printed instead of the actual path.
        print(f"⚠️ CSS νŒŒμΌμ„ 찾을 수 μ—†μŠ΅λ‹ˆλ‹€: {css_path}")
        raise FileNotFoundError(f"CSS νŒŒμΌμ„ 찾을 수 μ—†μŠ΅λ‹ˆλ‹€: {css_path}")
26
+
27
+
28
def create_interface():
    """Assemble the Gradio Blocks application and return it.

    The stylesheet is loaded through ``load_css()``; the page shows a heading
    followed by three tabs (leaderboard, submission/evaluation, dataset), each
    populated by its ``create_*_tab`` builder.
    """
    stylesheet = load_css()

    # (tab label, builder) pairs in display order.
    tab_builders = (
        ("πŸ† λ¦¬λ”λ³΄λ“œ", create_leaderboard_tab),
        ("πŸ“€ 제좜 및 평가", create_submission_tab),
        ("πŸ’Ύ 데이터셋", create_dataset_tab),
    )

    blocks_options = {
        "title": "Ko-FreshQA Leaderboard",
        "theme": gr.themes.Soft(),
        "css": stylesheet,
    }
    with gr.Blocks(**blocks_options) as app:
        gr.Markdown("# Ko-FreshQA Leaderboard")

        with gr.Tabs():
            for tab_label, build_tab in tab_builders:
                with gr.Tab(tab_label):
                    build_tab()

    return app
54
+
55
+
56
def main():
    """Validate the configuration, build the UI and launch the Gradio server.

    Exits with status 1 when ``Config.validate_required_configs()`` reports
    missing settings.

    On Hugging Face Spaces the server binds to 0.0.0.0:7860 with debug off.
    Elsewhere it runs with debug on and takes its bind address and port from
    the ``GRADIO_SERVER_NAME`` / ``GRADIO_SERVER_PORT`` environment variables,
    defaulting to 127.0.0.1:7860. Passing a hard-coded ``server_name`` to
    ``launch()`` would override those variables, which made the app
    unreachable from outside a container that sets them.
    """
    print("πŸ‡°πŸ‡· ν•œκ΅­μ–΄ FreshQA λ¦¬λ”λ³΄λ“œ μ‹œμž‘ 쀑...")
    print("πŸ“‹ λ¦¬λ”λ³΄λ“œ μ œμΆœμ„ μœ„ν•΄μ„œλŠ” 'πŸ“€ 제좜 및 평가' 탭을 μ‚¬μš©ν•˜μ„Έμš”.")

    # Validate the required settings before building any UI.
    try:
        Config.validate_required_configs()
    except ValueError as e:
        print(f"❌ μ„€μ • 였λ₯˜: {e}")
        import sys
        sys.exit(1)
    else:
        print("βœ… ν•„μˆ˜ μ„€μ • 검증 μ™„λ£Œ")

    app = create_interface()

    if Config.IS_HUGGINGFACE_SPACES:
        print("πŸš€ Hugging Face Spaces ν™˜κ²½μ—μ„œ μ‹€ν–‰ 쀑...")
        server_name = "0.0.0.0"
        server_port = 7860
        debug = False
    else:
        print("πŸ’» 둜컬 ν™˜κ²½μ—μ„œ μ‹€ν–‰ 쀑...")
        # Honour the Gradio environment variables; fall back to the previous
        # hard-coded values when they are unset, empty or malformed.
        server_name = os.getenv("GRADIO_SERVER_NAME") or "127.0.0.1"
        try:
            server_port = int(os.getenv("GRADIO_SERVER_PORT") or 7860)
        except ValueError:
            server_port = 7860
        debug = True

    app.launch(
        server_name=server_name,
        server_port=server_port,
        share=False,
        debug=debug,
        show_error=True
    )
93
+
94
+
95
+ if __name__ == "__main__":
96
+ main()
config.py ADDED
@@ -0,0 +1,76 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ μ„€μ • 관리 λͺ¨λ“ˆ
3
+ μ• ν”Œλ¦¬μΌ€μ΄μ…˜μ˜ λͺ¨λ“  섀정을 μ€‘μ•™μ—μ„œ κ΄€λ¦¬ν•©λ‹ˆλ‹€.
4
+ """
5
+
6
+ import os
7
+ from dotenv import load_dotenv
8
+
9
+ # .env 파일 λ‘œλ“œ
10
+ load_dotenv()
11
+
12
class Config:
    """Application settings, read once from the environment at import time.

    All values are class attributes; nothing is re-read after the class body
    has executed.
    """

    # --- API settings ---
    # Multi-key support: UPSTAGE_API_KEYS (comma separated) takes priority.
    # The single UPSTAGE_API_KEY is used when that variable is unset or
    # contains no usable entry (for example only commas or whitespace).
    _UPSTAGE_API_KEYS_RAW = os.getenv('UPSTAGE_API_KEYS')
    _parsed_keys = [k.strip() for k in (_UPSTAGE_API_KEYS_RAW or '').split(',') if k.strip()]
    if not _parsed_keys:
        _single = os.getenv('UPSTAGE_API_KEY')
        if _single and _single.strip():
            _parsed_keys = [_single.strip()]

    # Public attributes: the first key is also exposed under the legacy name.
    UPSTAGE_API_KEYS = _parsed_keys
    UPSTAGE_API_KEY = _parsed_keys[0] if _parsed_keys else None
    HF_TOKEN = os.getenv('HF_TOKEN')

    # --- Data settings ---
    FRESHQA_DATA_REPO_ID = os.getenv('FRESHQA_DATA_REPO_ID')
    FRESHQA_DATA_FILENAME = os.getenv('FRESHQA_DATA_FILENAME', 'ko-freshqa_2025_total.csv')

    # --- Prompt settings ---
    # Environment values are used verbatim (multi-line values are supported by
    # python-dotenv). The optional freshqa/freshqa_prompt.py module only fills
    # in prompts the environment does not provide. It is imported under
    # aliases: importing under the same names would overwrite the environment
    # values and turn the fallback checks below into no-ops.
    FRESHQA_PROMPT_RELAXED = os.getenv('FRESHQA_PROMPT_RELAXED')
    FRESHQA_PROMPT_STRICT = os.getenv('FRESHQA_PROMPT_STRICT')
    try:
        from freshqa.freshqa_prompt import (
            FRESHQA_PROMPT_RELAXED as _FILE_PROMPT_RELAXED,
            FRESHQA_PROMPT_STRICT as _FILE_PROMPT_STRICT,
        )
    except ImportError:
        # The prompt module is optional; missing prompts are reported by
        # validate_required_configs().
        _FILE_PROMPT_RELAXED = None
        _FILE_PROMPT_STRICT = None
    if not FRESHQA_PROMPT_RELAXED:
        FRESHQA_PROMPT_RELAXED = _FILE_PROMPT_RELAXED
    if not FRESHQA_PROMPT_STRICT:
        FRESHQA_PROMPT_STRICT = _FILE_PROMPT_STRICT

    # --- Submission limit settings ---
    ENABLE_SUBMISSION_LIMIT = os.getenv('ENABLE_SUBMISSION_LIMIT', 'true').lower() == 'true'
    SUBMISSION_TRACKER_REPO_ID = os.getenv('SUBMISSION_TRACKER_REPO_ID')

    # --- Environment detection ---
    IS_HUGGINGFACE_SPACES = os.getenv("SPACE_ID") is not None

    @classmethod
    def validate_required_configs(cls):
        """Check that every required setting has a non-empty value.

        Returns:
            True when nothing is missing.

        Raises:
            ValueError: listing the names of all missing settings.
        """
        required = (
            # Either the multi-key or the single-key variable satisfies this.
            ('UPSTAGE_API_KEY or UPSTAGE_API_KEYS', cls.UPSTAGE_API_KEYS),
            ('HF_TOKEN', cls.HF_TOKEN),
            ('FRESHQA_DATA_REPO_ID', cls.FRESHQA_DATA_REPO_ID),
            ('FRESHQA_PROMPT_RELAXED', cls.FRESHQA_PROMPT_RELAXED),
            ('FRESHQA_PROMPT_STRICT', cls.FRESHQA_PROMPT_STRICT),
        )
        missing_configs = [name for name, value in required if not value]

        if missing_configs:
            raise ValueError(f"ν•„μˆ˜ 섀정이 λˆ„λ½λ˜μ—ˆμŠ΅λ‹ˆλ‹€: {', '.join(missing_configs)}")

        return True
data/leaderboard_results.csv ADDED
@@ -0,0 +1 @@
 
 
1
+ id,model,description,accuracy,fast_changing_accuracy,slow_changing_accuracy,never_changing_accuracy,acc_vp,acc_fp,acc_vp_one_hop,acc_vp_two_hop,acc_fp_one_hop,acc_fp_two_hop,acc_vp_old,acc_vp_new,acc_fp_old,acc_fp_new,acc_politics,acc_sports,acc_entertainment,acc_weather,acc_world,acc_economy,acc_society,acc_it_science,acc_life_culture,acc_unknown,total_questions,evaluation_date,evaluation_mode
data/public/ko-freshqa_2025_dev.csv ADDED
The diff for this file is too large to render. See raw diff
 
data/public/ko-freshqa_2025_test.csv ADDED
The diff for this file is too large to render. See raw diff
 
docker-compose.yml ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ version: '3.8'
2
+
3
+ services:
4
+ freshqa-leaderboard:
5
+ build: .
6
+ ports:
7
+ - "7860:7860"
8
+ environment:
9
+ - UPSTAGE_API_KEY=${UPSTAGE_API_KEY}
10
+ volumes:
11
+ - ./datasets:/app/datasets
12
+ - ./results:/app/results
13
+ restart: unless-stopped
14
+ healthcheck:
15
+ test: ["CMD", "curl", "-f", "http://localhost:7860/"]
16
+ interval: 30s
17
+ timeout: 10s
18
+ retries: 3
19
+ start_period: 40s
env.example ADDED
@@ -0,0 +1,97 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ===========================================
2
+ # FreshQA Leaderboard ν™˜κ²½λ³€μˆ˜ μ„€μ • μ˜ˆμ‹œ
3
+ # ===========================================
4
+
5
+ # ===========================================
6
+ # Hugging Face μ„€μ •
7
+ # ===========================================
8
+ # Hugging Face API 토큰 (ν•„μˆ˜)
9
+ # - Private repository μ ‘κ·Όμš©
10
+ # - https://huggingface.co/settings/tokens μ—μ„œ λ°œκΈ‰
11
+ HF_TOKEN=your_huggingface_token_here
12
+
13
+ # ===========================================
14
+ # FreshQA κΈ°μ€€ 데이터 μ„€μ •
15
+ # ===========================================
16
+ # κΈ°μ€€ 데이터가 μžˆλŠ” HuggingFace Repository ID (ν•„μˆ˜)
17
+ # ν˜•μ‹: username/repository-name
18
+ FRESHQA_DATA_REPO_ID=james-demo-leaderboard-backend/kofreshqa-data-origin
19
+
20
+ # κΈ°μ€€ 데이터 파일λͺ… (선택사항, κΈ°λ³Έκ°’: ko-freshqa_2025_total.csv)
21
+ FRESHQA_DATA_FILENAME=ko-freshqa_2025_total.csv
22
+
23
+ # ===========================================
24
+ # FreshQA ν”„λ‘¬ν”„νŠΈ μ„€μ •
25
+ # ===========================================
26
+ # Fresheval ν”„λ‘¬ν”„νŠΈ λ³Έλ¬Έ
27
+ # 평가 ν”„λ‘¬ν”„νŠΈλŠ” μ œκ³΅ν•˜μ§€ μ•ŠμŠ΅λ‹ˆλ‹€.
28
+ FRESHQA_PROMPT_RELAXED=
29
+ FRESHQA_PROMPT_STRICT=
30
+
31
+ # ===========================================
32
+ # 제좜 좔적 μ„€μ •
33
+ # ===========================================
34
+ # 제좜 기둝을 μ €μž₯ν•  HuggingFace Repository ID (ν•„μˆ˜)
35
+ # ν˜•μ‹: username/repository-name
36
+ SUBMISSION_TRACKER_REPO_ID=james-demo-leaderboard-backend/submission-tracker
37
+
38
+ # 제좜 μ œν•œ κΈ°λŠ₯ ν™œμ„±ν™” μ—¬λΆ€ (선택사항, κΈ°λ³Έκ°’: true)
39
+ # - true: 제좜 μ œν•œ κΈ°λŠ₯ ν™œμ„±ν™” (ν•˜λ£¨ 3회 μ œν•œ)
40
+ # - false: 제좜 μ œν•œ κΈ°λŠ₯ λΉ„ν™œμ„±ν™” (둜컬 ν…ŒμŠ€νŠΈμš©)
41
+ ENABLE_SUBMISSION_LIMIT=true
42
+
43
+ # ===========================================
44
+ # AI 평가 API μ„€μ •
45
+ # ===========================================
46
+ # Upstage Solar Pro API ν‚€ (ν•„μˆ˜)
47
+ # - λͺ¨λΈ ν‰κ°€μš©
48
+ # - https://console.upstage.ai/ μ—μ„œ λ°œκΈ‰
49
+ UPSTAGE_API_KEY=your_upstage_api_key_here
50
+
51
+ # μ—¬λŸ¬ 개의 Upstage API ν‚€λ₯Ό μ‚¬μš©ν•  경우(선택사항)
52
+ # - 콀마둜 κ΅¬λΆ„ν•˜μ—¬ μž…λ ₯
53
+ # - μ„€μ • λ‘œλ”λŠ” UPSTAGE_API_KEYSκ°€ μ‘΄μž¬ν•˜λ©΄ 이λ₯Ό μš°μ„  μ‚¬μš©ν•˜κ³ ,
54
+ # μ—†μœΌλ©΄ 단일 λ³€μˆ˜ UPSTAGE_API_KEYλ₯Ό μ‚¬μš©ν•©λ‹ˆλ‹€.
55
+ # μ˜ˆμ‹œ)
56
+ # UPSTAGE_API_KEYS=keyA,keyB,keyC
57
+
58
+ # ===========================================
59
+ # μ• ν”Œλ¦¬μΌ€μ΄μ…˜ μ„€μ •
60
+ # ===========================================
61
+ # μ„œλ²„ 포트 (선택사항, κΈ°λ³Έκ°’: 7860)
62
+ # PORT=7860
63
+
64
+ # 디버그 λͺ¨λ“œ (선택사항, κΈ°λ³Έκ°’: false)
65
+ # DEBUG=false
66
+
67
+ # ===========================================
68
+ # μ‚¬μš© μ˜ˆμ‹œ
69
+ # ===========================================
70
+ # 1. 이 νŒŒμΌμ„ .env둜 λ³΅μ‚¬ν•˜μ„Έμš”:
71
+ # cp env.example .env
72
+ #
73
+ # 2. μ‹€μ œ κ°’μœΌλ‘œ λ³€κ²½ν•˜μ„Έμš”:
74
+ # - HF_TOKEN: μ‹€μ œ HuggingFace 토큰
75
+ # - FRESHQA_DATA_REPO_ID: μ‹€μ œ repository ID
76
+ # - FRESHQA_DATA_FILENAME: μ‹€μ œ 파일λͺ… (선택사항)
77
+ # - SUBMISSION_TRACKER_REPO_ID: 제좜 μΆ”μ μš© repository ID
78
+ # - ENABLE_SUBMISSION_LIMIT: 제좜 μ œν•œ κΈ°λŠ₯ ν™œμ„±ν™” μ—¬λΆ€ (둜컬 ν…ŒμŠ€νŠΈ μ‹œ false)
79
+ # - UPSTAGE_API_KEY: μ‹€μ œ Upstage API ν‚€
80
+ #
81
+ # 3. Python에서 자동 로드됨 (config.py에서 load_dotenv() 호출)
82
+ # λ˜λŠ” μˆ˜λ™μœΌλ‘œ λ‘œλ“œ:
83
+ # from dotenv import load_dotenv
84
+ # load_dotenv()
85
+ #
86
+ # 4. λ˜λŠ” 직접 ν™˜κ²½λ³€μˆ˜ μ„€μ • (μ‹œμŠ€ν…œ 레벨):
87
+ # export HF_TOKEN="your_token"
88
+ # export FRESHQA_DATA_REPO_ID="username/repo"
89
+ # export FRESHQA_DATA_FILENAME="filename.csv"
90
+ # export UPSTAGE_API_KEY="your_api_key"
91
+
92
+ # ===========================================
93
+ # λ³΄μ•ˆ μ£Όμ˜μ‚¬ν•­
94
+ # ===========================================
95
+ # - .env νŒŒμΌμ€ μ ˆλŒ€ Git에 μ»€λ°‹ν•˜μ§€ λ§ˆμ„Έμš”
96
+ # - μ‹€μ œ 토큰과 API ν‚€λŠ” μ•ˆμ „ν•˜κ²Œ λ³΄κ΄€ν•˜μ„Έμš”
97
+ # - ν”„λ‘œλ•μ…˜ ν™˜κ²½μ—μ„œλŠ” ν™˜κ²½λ³€μˆ˜λ‘œ 직접 μ„€μ •ν•˜μ„Έμš”
environment.yml ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
name: freshqa-leaderboard
channels:
  - conda-forge
  - defaults
dependencies:
  # Python 3.10 matches the README requirements and the gradio>=5 pin below.
  - python=3.10
  - pip
  - numpy>=1.24.0
  - pandas>=2.0.0
  - requests>=2.25.0
  - pip:
    - gradio>=5.0.0
    - plotly>=5.0.0
    - pytz>=2023.3
    - python-dateutil>=2.8.0
    - openpyxl>=3.0.0
    - httpx>=0.24.0
    - seaborn>=0.12.0
    - matplotlib>=3.7.0
    - tqdm>=4.65.0
    - huggingface_hub<1.0.0
    # Imported by config.py (dotenv) and freshqa/fresheval.py (openai.OpenAI).
    - python-dotenv
    - openai>=1.0.0
freshqa/fresheval.py ADDED
@@ -0,0 +1,358 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import pandas as pd
3
+ from openai import OpenAI
4
+ from typing import List, Dict, Any, Tuple
5
+ import time
6
+ import random
7
+
8
+ from config import Config
9
+ from src.utils import get_current_date_str
10
+
11
+ class FreshEval:
12
+
13
def __init__(self, model: str='solar-pro2', api_key: str=None):
    """Create an evaluator bound to one model and one API key.

    Args:
        model: name of the judge model sent with every request.
        api_key: key for the Upstage endpoint; when falsy,
            ``Config.UPSTAGE_API_KEY`` is used instead.
    """
    self.model = model
    self.api_key = api_key or Config.UPSTAGE_API_KEY
    self.client = OpenAI(
        api_key=self.api_key,
        base_url="https://api.upstage.ai/v1/solar"
    )

    # Request parameters used by call_llm_api.
    self.temperature = 0.0
    self.max_tokens = 256
    self.chat_completions = True

    # Models whose name starts with "gpt-4" or "solar" get the larger budgets.
    uses_large_budget = model.startswith(('gpt-4', 'solar'))
    self.num_organic_results = 15
    self.num_related_questions = 3 if uses_large_budget else 2
    self.num_questions_and_answers = 3 if uses_large_budget else 2
    self.num_retrieved_evidences = 15 if uses_large_budget else 5
35
+
36
+
37
+ def _is_rate_limit_error(self, error: Exception) -> bool:
38
+ """429 μ—λŸ¬ 감지 ν•¨μˆ˜"""
39
+ error_str = str(error)
40
+ error_type = type(error).__name__
41
+
42
+ # 1. HTTP μƒνƒœ μ½”λ“œ 확인
43
+ if hasattr(error, 'response') and hasattr(error.response, 'status_code'):
44
+ if error.response.status_code == 429:
45
+ print(f"βœ… HTTP 429 μ—λŸ¬ 감지: {error.response.status_code}")
46
+ return True
47
+
48
+ # 2. ν…μŠ€νŠΈ 기반 감지 (λ°±μ—…)
49
+ error_lower = error_str.lower()
50
+ if ("429" in error_lower or
51
+ "rate" in error_lower or
52
+ "limit" in error_lower or
53
+ "too_many_requests" in error_lower or
54
+ "request limit" in error_lower):
55
+ # print(f"βœ… ν…μŠ€νŠΈ 기반 429 μ—λŸ¬ 감지")
56
+ return True
57
+
58
+ return False
59
+
60
+
61
def call_llm_api(self, prompt:str, current_date:str) -> str:
    """Send ``prompt`` to the judge model and return the text of its reply.

    Only failures classified by ``_is_rate_limit_error`` are retried; any
    other exception is re-raised immediately in both modes below.

    * At most one key in the rotator: up to 7 attempts on this instance, with
      exponential backoff (3 s, 6 s, 12 s, ... plus up to 1 s of jitter).
    * Several keys: up to ``3 * number_of_keys`` attempts. After each
      rate-limit failure it waits 2 seconds and continues with the key
      returned by ``rotator.pick_key()``.

    Args:
        prompt: the full evaluation prompt (sent as the last user message).
        current_date: date string inserted into the system and assistant
            messages.

    Returns:
        The content of the first choice of the completion.

    Raises:
        Exception: the original API error for non-rate-limit failures or when
            single-key retries are exhausted; a generic ``Exception`` when all
            multi-key attempts hit the rate limit.
    """
    from src.api_key_rotator import get_rotator

    rotator = get_rotator()
    num_keys = len(rotator.keys)
    base_delay = 3.0

    def _make_api_call(eval_instance: "FreshEval") -> str:
        """Issue one request through ``eval_instance`` and return its text."""
        if eval_instance.chat_completions:
            # Chat completions API
            response = eval_instance.client.chat.completions.create(
                model=eval_instance.model,
                temperature=eval_instance.temperature,
                max_tokens=eval_instance.max_tokens,
                messages=[
                    {
                        "role": "system",
                        "content": (
                            f"You are a helpful assistant. Respond as concisely as possible. Knowledge cutoff: {current_date}."
                        )
                    },
                    {
                        "role": "user",
                        "content": "What's today's date?"
                    },
                    {
                        "role": "assistant",
                        "content": f"Today is {current_date} in Pacific Standard Time."
                    },
                    {
                        "role": "user",
                        "content": prompt
                    }
                ],
            )
            return response.choices[0].message.content
        else:
            # Completions API
            response = eval_instance.client.completions.create(
                model=eval_instance.model,
                temperature=eval_instance.temperature,
                max_tokens=eval_instance.max_tokens,
                prompt=prompt,
            )
            return response.choices[0].text

    # Start with this instance: its client was already built for
    # self.api_key, so a second client for the same key is unnecessary.
    current_instance = self

    # Single key: exponential backoff only.
    if num_keys <= 1:
        max_retries = 7
        for attempt in range(max_retries):
            try:
                return _make_api_call(current_instance)
            except Exception as e:
                if not self._is_rate_limit_error(e):
                    # Non-429 errors propagate immediately, as in the
                    # multi-key path below.
                    raise
                if attempt == max_retries - 1:
                    print(f"❌ μ΅œλŒ€ μž¬μ‹œλ„ 횟수 초과")
                    raise
                # Exponential backoff with jitter.
                delay = base_delay * (2 ** attempt) + random.uniform(0, 1)
                time.sleep(delay)

        # Defensive: the loop above always returns or raises.
        raise Exception("call llm api:μ΅œλŒ€ μž¬μ‹œλ„ 횟수 초과")

    # Several keys: rotate on rate limits, at most three passes over all keys.
    max_attempts = num_keys * 3

    for attempt in range(max_attempts):
        try:
            return _make_api_call(current_instance)  # return on first success
        except Exception as e:
            if not self._is_rate_limit_error(e):
                # Non-429 errors propagate immediately.
                raise
            # Wait 2 seconds before switching to the next key.
            time.sleep(2)
            current_key = rotator.pick_key()
            current_instance = FreshEval(model=self.model, api_key=current_key)

    # Every attempt on every key was rate limited.
    raise Exception(f"λͺ¨λ“  API ν‚€μ—μ„œ 429 μ—λŸ¬ λ°œμƒ (μ΅œλŒ€ {max_attempts}회 μ‹œλ„)")
157
+
158
+
159
def call_fresheval(self, mode:str, question:str, evaluation:str, current_date:str) -> str:
    """Build the FreshEval prompt for one question and return the judge reply.

    Args:
        mode: 'Relaxed' or 'Strict'; selects the prompt body from ``Config``.
        question: the question text.
        evaluation: the pre-formatted block appended right after the question.
        current_date: date string forwarded to ``call_llm_api``.

    Returns:
        The raw text returned by ``call_llm_api``.

    Raises:
        ValueError: when no non-blank prompt is configured for ``mode``
            (including any mode other than 'Relaxed' or 'Strict').
    """
    question_block = f'\nquestion: {question}{evaluation}'

    # Pick the configured prompt body (prefix + demonstrations) for the mode.
    if mode == 'Relaxed':
        configured_body = Config.FRESHQA_PROMPT_RELAXED
    elif mode == 'Strict':
        configured_body = Config.FRESHQA_PROMPT_STRICT
    else:
        configured_body = None

    prompt_body = str(configured_body).strip() if configured_body else ''
    if not prompt_body:
        raise ValueError(f"{mode} 평가 ν”„λ‘¬ν”„νŠΈ 섀정이 μ—†μŠ΅λ‹ˆλ‹€.")

    # Run the evaluation.
    return self.call_llm_api(prompt_body + question_block, current_date)
182
+
183
+
184
def extract_ratings(self, response:str) -> Tuple[bool, Dict[str, str]]:
    """Extract a TRUE/FALSE rating from the judge's free-text reply.

    Lookup order:
      1. a "Final Evaluation:" label;
      2. an "Evaluation:" label;
      3. the sentences "thus, the response is credited" / "... not credited".

    Args:
        response: the judge reply. A non-string value (for example ``None``)
            is treated as a reply without a rating.

    Returns:
        ``(True, {'rating': 'TRUE' | 'FALSE'})`` when a verdict was found,
        otherwise ``(False, {'rating': None})``.
    """
    if not isinstance(response, str):
        # Report "no rating" instead of letting re.search raise TypeError.
        return False, {'rating': None}

    def _clean(text: str) -> str:
        # Strip markdown decoration/whitespace at both ends, drop leftover
        # decoration characters inside, then lowercase.
        text = re.sub(r'^[*`_~\s]+|[*`_~\s]+$', '', text)
        text = re.sub(r'[*`_~]', '', text)
        return text.strip().strip('.').strip().lower()

    def _judge(val: str):
        """
        Decide correct/incorrect from a string.
        - 'incorrect' always means FALSE
        - 'partially correct' is ambiguous -> None
        - 'correct' means TRUE
        """
        if re.search(r'(?i)\bincorrect\b', val):
            return 'FALSE'
        if re.search(r'(?i)\bpartial(?:ly)?\s+correct\b', val):
            return None
        if re.search(r'(?i)\bcorrect\b', val):
            return 'TRUE'
        return None

    def _from_label(block_label: str):
        """
        Look for a verdict after a label regex such as 'final\\s+evaluation'.
        - first try the text captured after the label and colon
        - otherwise scan from the label up to the next blank line
        """
        # Capture: label, optional decoration, colon, then the rest of a line.
        same_line = re.search(
            rf'(?i){block_label}\s*(?:[*`_~]*\s*:\s*|:\s*[*`_~]*)\s*([^\r\n]+)',
            response
        )
        if same_line:
            val = _clean(same_line.group(1))
            j = _judge(val)
            if j is not None:
                return j

        # Locate the label only, then scan until the next blank line.
        pos = re.search(
            rf'(?i){block_label}\s*(?:[*`_~]*\s*:\s*|:\s*[*`_~]*)',
            response
        )
        if pos:
            tail = response[pos.end():]
            # Stop at the next blank line; without one, cap the scan at 300
            # characters so it does not wander too far.
            m_stop = re.search(r'\n\s*\n', tail)
            segment = tail[:m_stop.start()] if m_stop else tail[:300]
            seg_clean = _clean(segment)
            j = _judge(seg_clean)
            if j is not None:
                return j
        return None

    # 1) "Final Evaluation" has top priority. Raw string: '\s' in a plain
    #    string literal is an invalid escape sequence.
    final_judgement = _from_label(r'final\s+evaluation')
    if final_judgement:
        return True, {'rating': final_judgement}

    # 2) "Evaluation"
    eval_judgement = _from_label(r'evaluation')
    if eval_judgement:
        return True, {'rating': eval_judgement}

    # 3) Fallback: the "credited" sentences.
    if re.search(r'(?i)thus,\s*the\s*response\s*is\s*credited\b', response):
        return True, {'rating': 'TRUE'}
    if re.search(r'(?i)thus,\s*the\s*response\s*is\s*not\s*credited\b', response):
        return True, {'rating': 'FALSE'}

    # 4) No verdict found.
    return False, {'rating': None}
258
+
259
+
260
def evaluate_single_row(self, row: pd.Series, mode: str, current_date:str) -> Dict[str, Any]:
    """Evaluate one row and return it as a dict with rating and explanation.

    Args:
        row: a row providing 'question', 'model_response' and the columns
            'answer_0' .. 'answer_9'.
        mode: evaluation mode forwarded to ``call_fresheval``.
        current_date: date string forwarded to ``call_fresheval``.

    Returns:
        ``row.to_dict()`` extended with:
          * 'rating': the value from ``extract_ratings``; 0 when the model
            response is empty/NaN or when no valid rating was obtained;
          * 'explanation': the judge reply, or a fixed note in the two
            fallback cases.
    """
    question = row['question']
    response = row['model_response']

    # Keep only reference answers that are present and non-blank.
    reference_answers = []
    for slot in range(10):
        candidate = row[f'answer_{slot}']
        if pd.notna(candidate) and str(candidate).strip():
            reference_answers.append(str(candidate))

    result = row.to_dict()

    # An empty or NaN model response is scored as wrong without calling the judge.
    response_is_blank = isinstance(response, str) and response.strip() == ''
    if pd.isna(response) or response_is_blank:
        result['rating'] = 0
        result['explanation'] = "model_responseκ°€ λΉ„μ–΄μžˆμŒ"
        return result

    # Evaluation block appended after the question in the prompt.
    evaluation = (
        f"\ncorrect answer(s): {' | '.join(reference_answers)}"
        f"\nresponse: {response}"
        "\ncomment: "
    )

    # One initial attempt plus up to max_retries re-evaluations, stopping at
    # the first reply from which a rating can be extracted.
    max_retries = 5
    for _ in range(1 + max_retries):
        fresheval_response = self.call_fresheval(
            mode=mode,
            question=question,
            evaluation=evaluation,
            current_date=current_date
        )
        is_valid_eval, eval_result = self.extract_ratings(fresheval_response)
        if is_valid_eval:
            break
    else:
        # No attempt produced a valid rating: use the default.
        print(f'⚠️ μ΅œλŒ€ μž¬μ‹œλ„ 횟수({max_retries}) 초과. κΈ°λ³Έκ°’ μ‚¬μš©: rating=0')
        eval_result = {'rating': 0}
        fresheval_response = "μž¬μ‹œλ„ 횟수 초과둜 μΈν•œ κΈ°λ³Έ 평가"

    result['rating'] = eval_result['rating']
    result['explanation'] = fresheval_response
    return result
344
+
345
+
346
def evaluate_dataframe(self, df: pd.DataFrame, mode: str) -> pd.DataFrame:
    """Evaluate every row of ``df`` sequentially.

    Args:
        df: rows to evaluate; each must satisfy ``evaluate_single_row``.
        mode: evaluation mode forwarded to ``evaluate_single_row``.

    Returns:
        A new DataFrame built from the per-row result dicts, in input order.
    """
    freshevals = []
    current_date = get_current_date_str()

    len_df = len(df)
    # Count rows by position. The index label is not necessarily 0..n-1 (for
    # example after filtering or merging) and need not even be numeric.
    for position, (_, row) in enumerate(df.iterrows(), start=1):
        print(f'{mode} 평가 쀑... {position}/{len_df}')
        row_dict = self.evaluate_single_row(row, mode, current_date)
        freshevals.append(row_dict)

    return pd.DataFrame(freshevals)
freshqa/fresheval_parallel.py ADDED
@@ -0,0 +1,113 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ from concurrent.futures import ThreadPoolExecutor, as_completed
3
+ from typing import Dict, Any
4
+ import time
5
+ import queue
6
+
7
+ from freshqa.fresheval import FreshEval
8
+ from src.api_key_rotator import get_rotator
9
+ from src.utils import get_current_date_str
10
+
11
+
12
class FreshEvalParallel:
    """Wrapper that runs FreshEval over a DataFrame with a thread pool."""

    def __init__(self, model: str = 'solar-pro2', max_workers: int = 4):
        self.model = model
        self.max_workers = max_workers

    def evaluate_dataframe(self, df: pd.DataFrame, mode: str, progress_queue: "queue.Queue[int] | None" = None, on_item_done=None) -> pd.DataFrame:
        """Evaluate all rows of ``df`` in parallel.

        Args:
            df: Rows to evaluate.
            mode: Evaluation mode forwarded to the worker.
            progress_queue: Optional queue; ``1`` is put on it once per
                finished row, whether the row succeeded or failed.
            on_item_done: Optional callback ``(position, result)`` invoked
                after each successfully evaluated row.

        Returns:
            A DataFrame of per-row result dictionaries in the original row
            order (not in completion order).
        """
        current_date = get_current_date_str()
        rows = [row for _, row in df.iterrows()]

        # Pre-sized so each result can be stored at its original position.
        results = [None] * len(rows)
        start_time = time.time()

        with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
            future_to_index = {
                executor.submit(self._evaluate_single_row_worker, (row, mode, current_date)): i
                for i, row in enumerate(rows)
            }

            for future in as_completed(future_to_index):
                original_index = future_to_index[future]
                succeeded = True
                try:
                    result = future.result()
                except Exception as e:
                    succeeded = False
                    print(f"❌ 평가 μ‹€νŒ¨ (ν–‰ {original_index}): {e}")
                    # Keep the row's own columns so a failed row is still
                    # identifiable downstream; only rating/explanation are
                    # replaced with the failure placeholder.
                    result = rows[original_index].to_dict()
                    result['rating'] = 0
                    result['explanation'] = f"평가 μ‹€νŒ¨: {str(e)}"

                results[original_index] = result

                # Exactly one progress tick per row.
                if progress_queue is not None:
                    progress_queue.put(1)

                # The callback runs outside the evaluation try-block: an
                # error raised by the callback must not overwrite a good
                # result or count the row twice.
                if succeeded and on_item_done:
                    try:
                        on_item_done(original_index, result)
                    except Exception as e:
                        print(f"on_item_done callback failed (row {original_index}): {e}")

        total_time = time.time() - start_time
        print(f"βœ… {mode} λͺ¨λ“œ 평가 μ™„λ£Œ: {total_time:.1f}초 μ†Œμš”")

        return pd.DataFrame(results)

    def _evaluate_single_row_worker(self, args: tuple) -> Dict[str, Any]:
        """Evaluate one row with a FreshEval instance created for this call.

        Args:
            args: ``(row, mode, current_date)`` tuple.

        Returns:
            Whatever ``FreshEval.evaluate_single_row`` returns for the row.
        """
        row, mode, current_date = args

        # A fresh evaluator per task, with its API key taken from the rotator.
        api_key = get_rotator().pick_key()
        worker_eval = FreshEval(model=self.model, api_key=api_key)

        return worker_eval.evaluate_single_row(row, mode, current_date)
102
+
103
+
104
+ # 편의 ν•¨μˆ˜
105
def evaluate_dataframe_parallel(
        df: pd.DataFrame,
        mode: str,
        on_item_done=None,
        progress_queue: "queue.Queue[int] | None" = None,
        max_workers: int = 4) -> pd.DataFrame:
    """Convenience wrapper: evaluate ``df`` in parallel with the 'solar-pro2' model.

    Builds a ``FreshEvalParallel`` with ``max_workers`` threads and returns
    the result of its ``evaluate_dataframe`` call.
    """
    evaluator = FreshEvalParallel(max_workers=max_workers, model='solar-pro2')
    return evaluator.evaluate_dataframe(
        df,
        mode,
        progress_queue=progress_queue,
        on_item_done=on_item_done,
    )
freshqa/freshqa_acc.py ADDED
@@ -0,0 +1,361 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ FreshQA 정확도 계산 슀크립트
4
+
5
+ 이 μŠ€ν¬λ¦½νŠΈλŠ” FreshQA λ°μ΄ν„°μ…‹μ˜ 정확도λ₯Ό κ³„μ‚°ν•˜κ³  λ‹€μ–‘ν•œ μΉ΄ν…Œκ³ λ¦¬λ³„λ‘œ λΆ„μ„ν•©λ‹ˆλ‹€.
6
+ """
7
+
8
+ import pandas as pd
9
+ import sys
10
+ import os
11
+
12
+
13
def load_freshqa_data(csv_path='freshqa.csv'):
    """Load a FreshQA CSV file into a DataFrame.

    The file is first read as-is. If it has a ``rating`` column it is
    returned unchanged; otherwise it is re-read with the first two lines
    skipped. On any read error a message is printed and the process exits
    with status 1.
    """
    try:
        probe = pd.read_csv(csv_path)
        if 'rating' not in probe.columns:
            # No rating column: re-read, skipping the two leading lines.
            return pd.read_csv(csv_path, skiprows=[0, 1])
        return probe
    except FileNotFoundError:
        print(f"였λ₯˜: {csv_path} νŒŒμΌμ„ 찾을 수 μ—†μŠ΅λ‹ˆλ‹€.")
        print("ν˜„μž¬ 디렉토리에 freshqa.csv 파일이 μžˆλŠ”μ§€ ν™•μΈν•΄μ£Όμ„Έμš”.")
        sys.exit(1)
    except Exception as e:
        print(f"데이터 λ‘œλ“œ 쀑 였λ₯˜ λ°œμƒ: {e}")
        sys.exit(1)
37
+
38
+
39
def process_freshqa_dataframe(df):
    """Return a copy of ``df`` that is guaranteed to have a ``rating`` column.

    An existing ``rating`` column is kept as-is; a missing one is created
    and filled with 0. The input frame is never modified. Any error is
    printed and re-raised.
    """
    try:
        prepared = df.copy()
        if 'rating' not in prepared.columns:
            prepared['rating'] = 0  # default rating when none was supplied

        print(f"FreshQA 데이터 처리 μ™„λ£Œ: {len(prepared)}개 μƒ˜ν”Œ")
        return prepared
    except Exception as e:
        print(f"데이터 처리 쀑 였λ₯˜ λ°œμƒ: {e}")
        raise
58
+
59
+
60
def update_results(df, d_acc, d_count, field_name):
    """Store the accuracy and sample count of ``df`` under ``field_name``.

    A row counts as correct when its ``rating`` is the string ``'TRUE'`` or
    equals the number 1. Both forms are accepted regardless of the column's
    dtype, so string-dtype columns and object columns that mix strings with
    numeric placeholders are scored correctly.

    Args:
        df: Frame with a ``rating`` column (not read when ``df`` is empty).
        d_acc: Dict updated in place with the accuracy as a percentage.
        d_count: Dict updated in place with ``len(df)``.
        field_name: Key written into both dicts.
    """
    total = len(df)
    if total == 0:
        accuracy = 0.0
    else:
        ratings = df['rating']
        # Two elementwise comparisons instead of a dtype switch: a value
        # that cannot match simply compares as False.
        correct = (ratings == 'TRUE') | (ratings == 1)
        accuracy = int(correct.sum()) * 100 / total
    d_acc[field_name] = accuracy
    d_count[field_name] = total
74
+
75
+
76
def calculate_accuracy_simple(fresh_qa):
    """Compute basic FreshQA accuracies, tolerating missing columns.

    Every breakdown is computed only when the column it needs is present:
    ``split`` (TEST/DEV), ``fact_type``, ``false_premise`` and ``domain``.
    A missing ``rating`` column is treated as all-zero ratings.

    Args:
        fresh_qa: DataFrame of evaluated questions. It is not modified.

    Returns:
        Dict mapping metric name to accuracy in percent, plus
        ``'total_questions'`` with the number of rows.
    """
    print("정확도 계산 쀑...")

    if 'rating' not in fresh_qa.columns:
        # assign() returns a new frame, so the caller's DataFrame is not
        # silently given an extra column.
        fresh_qa = fresh_qa.assign(rating=0)

    accs = {}
    counts = {}
    has_split = 'split' in fresh_qa.columns

    # Overall accuracy
    update_results(fresh_qa, accs, counts, 'overall_accuracy')

    # Per-split accuracy
    if has_split:
        update_results(fresh_qa[fresh_qa['split'] == 'TEST'], accs, counts, 'acc_test')
        update_results(fresh_qa[fresh_qa['split'] == 'DEV'], accs, counts, 'acc_dev')

    # Per fact-type accuracy (only for fact types that actually occur)
    if 'fact_type' in fresh_qa.columns:
        fact_values = fresh_qa['fact_type'].values
        for fact_type in ['fast-changing', 'slow-changing', 'never-changing']:
            if fact_type in fact_values:
                sub_df = fresh_qa[fresh_qa['fact_type'] == fact_type]
                update_results(sub_df, accs, counts, f'{fact_type}_accuracy')

    # False-premise accuracy (elementwise comparison, hence ``== True``)
    if 'false_premise' in fresh_qa.columns:
        fp_df = fresh_qa[fresh_qa['false_premise'] == True]
        if len(fp_df) > 0:
            update_results(fp_df, accs, counts, 'false_premise_accuracy')

    # Per-domain accuracy
    if 'domain' in fresh_qa.columns:
        domain_values = fresh_qa['domain'].values

        # Korean domain categories (the domain values used in the CSV file)
        korean_domains = ['μ •μΉ˜', '슀포츠', 'μ—°μ˜ˆ', '날씨', '세계', '경제', 'μ‚¬νšŒ', 'IT/κ³Όν•™', 'μƒν™œ/λ¬Έν™”', 'UNK']
        # Only these three domains get a fixed English key; all others keep
        # their sanitized original name as the key.
        fixed_keys = {
            'IT/κ³Όν•™': 'it_science',
            'μƒν™œ/λ¬Έν™”': 'life_culture',
            'UNK': 'unknown',
        }

        for domain in korean_domains:
            if domain not in domain_values:
                continue
            domain_df = fresh_qa[fresh_qa['domain'] == domain]
            domain_key = fixed_keys.get(
                domain, domain.replace('/', '_').replace(' ', '_').lower())

            update_results(domain_df, accs, counts, f'acc_{domain_key}')
            # The TEST/DEV breakdown needs the split column; without this
            # guard a frame lacking ``split`` crashed here.
            if has_split:
                update_results(domain_df[domain_df['split'] == 'TEST'], accs, counts, f'acc_test_{domain_key}')
                update_results(domain_df[domain_df['split'] == 'DEV'], accs, counts, f'acc_dev_{domain_key}')

        # English domain names are still supported for compatibility
        english_domains = ['politics', 'sports', 'entertainment', 'weather', 'world', 'economy', 'society', 'it_science', 'life_culture']
        for domain in english_domains:
            if domain in domain_values:
                domain_df = fresh_qa[fresh_qa['domain'] == domain]
                update_results(domain_df, accs, counts, f'{domain}_accuracy')

    # Total number of questions
    accs['total_questions'] = len(fresh_qa)

    return accs
146
+
147
def calculate_accuracy(fresh_qa):
    """Compute the full FreshQA accuracy breakdown.

    Produces overall, TEST and DEV accuracies for the whole set, for each
    fact type (valid-premise questions only), for valid/false-premise
    questions (further split by hop count and by effective year), and for
    each known domain present in the data.

    Returns:
        ``(accs, counts)``: two dicts keyed by metric name, holding the
        accuracy in percent and the number of samples respectively.
    """
    accs = {}
    counts = {}

    def record(frame, label):
        # Slice TEST/DEV first, then store overall / test / dev, in that order.
        test_part = frame[frame.split == 'TEST']
        dev_part = frame[frame.split == 'DEV']
        update_results(frame, accs, counts, f'acc_{label}')
        update_results(test_part, accs, counts, f'acc_test_{label}')
        update_results(dev_part, accs, counts, f'acc_dev_{label}')

    # Whole data set (these keys carry no label suffix)
    whole_test = fresh_qa[fresh_qa.split == 'TEST']
    whole_dev = fresh_qa[fresh_qa.split == 'DEV']
    update_results(fresh_qa, accs, counts, 'acc')
    update_results(whole_test, accs, counts, 'acc_test')
    update_results(whole_dev, accs, counts, 'acc_dev')

    # Per fact type, restricted to valid-premise questions
    for fact_type in ['fast-changing', 'slow-changing', 'never-changing']:
        selected = fresh_qa[(fresh_qa.false_premise == False) & (fresh_qa.fact_type == fact_type)]
        record(selected, fact_type.replace('-', '_'))

    # Per question type (vp: valid premise, fp: false premise)
    for qt in ['vp', 'fp']:
        data = fresh_qa[(fresh_qa.false_premise == (qt == 'fp'))]

        # Hop-count subsets
        one_hop = data[data.num_hops == 'one-hop']
        multi_hop = data[data.num_hops == 'multi-hop']

        # Effective-year subsets: 2022/2023 count as "new", the rest as "old"
        older = data[(data.effective_year != '2022') & (data.effective_year != '2023')]
        newer = data[(data.effective_year == '2022') | (data.effective_year == '2023')]

        record(data, qt)
        record(one_hop, f'{qt}_one_hop')
        record(multi_hop, f'{qt}_two_hop')
        record(older, f'{qt}_old')
        record(newer, f'{qt}_new')

    # Per domain
    if 'domain' in fresh_qa.columns:
        # Korean domain value in the CSV -> English key used in metric names
        domain_mapping = {
            'μ •μΉ˜': 'politics',
            '슀포츠': 'sports',
            'μ—°μ˜ˆ': 'entertainment',
            '날씨': 'weather',
            '세계': 'world',
            '경제': 'economy',
            'μ‚¬νšŒ': 'society',
            'IT/κ³Όν•™': 'it_science',
            'μƒν™œ/λ¬Έν™”': 'life_culture',
            'UNK': 'unknown'
        }
        for domain, domain_key in domain_mapping.items():
            if domain in fresh_qa['domain'].values:
                record(fresh_qa[fresh_qa.domain == domain], domain_key)

    return accs, counts
252
+
253
+
254
def print_results(accs, counts):
    """Print the accuracy report in a human-readable layout.

    Args:
        accs: Metric name -> accuracy in percent. The overall, fact-type
            and question-type keys are read unconditionally, so they must
            be present; domain keys are optional.
        counts: Metric name -> number of samples, with the same keys.
    """
    print("\n" + "="*80)
    print("FreshQA 정확도 뢄석 κ²°κ³Ό")
    print("="*80)

    # Overall accuracy
    print(f"\nπŸ“Š 전체 정확도:")
    print(f" 전체: {accs['acc']}% ({counts['acc']}개 μƒ˜ν”Œ)")
    print(f" ν…ŒμŠ€νŠΈ: {accs['acc_test']}% ({counts['acc_test']}개 μƒ˜ν”Œ)")
    print(f" 개발: {accs['acc_dev']}% ({counts['acc_dev']}개 μƒ˜ν”Œ)")

    # Accuracy per fact type
    print(f"\nπŸ“ˆ 사싀 μœ ν˜•λ³„ 정확도:")
    fact_types = {
        'fast_changing': 'λΉ λ₯΄κ²Œ λ³€ν•˜λŠ” 사싀',
        'slow_changing': '천천히 λ³€ν•˜λŠ” 사싀',
        'never_changing': 'λ³€ν•˜μ§€ μ•ŠλŠ” 사싀'
    }

    for key, name in fact_types.items():
        print(f" {name}:")
        print(f" 전체: {accs[f'acc_{key}']}% ({counts[f'acc_{key}']}개 μƒ˜ν”Œ)")
        print(f" ν…ŒμŠ€νŠΈ: {accs[f'acc_test_{key}']}% ({counts[f'acc_test_{key}']}개 μƒ˜ν”Œ)")
        print(f" 개발: {accs[f'acc_dev_{key}']}% ({counts[f'acc_dev_{key}']}개 μƒ˜ν”Œ)")

    # Accuracy per question type
    print(f"\n❓ 질문 μœ ν˜•λ³„ 정확도:")
    question_types = {
        'vp': 'μœ νš¨ν•œ μ „μ œ (Valid Premise)',
        'fp': '잘λͺ»λœ μ „μ œ (False Premise)'
    }

    for key, name in question_types.items():
        print(f" {name}:")
        print(f" 전체: {accs[f'acc_{key}']}% ({counts[f'acc_{key}']}개 μƒ˜ν”Œ)")
        print(f" ν…ŒμŠ€νŠΈ: {accs[f'acc_test_{key}']}% ({counts[f'acc_test_{key}']}개 μƒ˜ν”Œ)")
        print(f" 개발: {accs[f'acc_dev_{key}']}% ({counts[f'acc_dev_{key}']}개 μƒ˜ν”Œ)")

        # By hop count
        print(f" 단일 홉: {accs[f'acc_{key}_one_hop']}% ({counts[f'acc_{key}_one_hop']}개 μƒ˜ν”Œ)")
        print(f" 닀쀑 홉: {accs[f'acc_{key}_two_hop']}% ({counts[f'acc_{key}_two_hop']}개 μƒ˜ν”Œ)")

        # By effective year
        print(f" 였래된 데이터: {accs[f'acc_{key}_old']}% ({counts[f'acc_{key}_old']}개 μƒ˜ν”Œ)")
        print(f" μ΅œμ‹  데이터: {accs[f'acc_{key}_new']}% ({counts[f'acc_{key}_new']}개 μƒ˜ν”Œ)")

    # Accuracy per domain; a domain is printed only if its key exists in accs
    print(f"\n🌐 도메인별 정확도:")
    domain_mapping = {
        'politics': 'μ •μΉ˜',
        'sports': '슀포츠',
        'entertainment': 'μ—°μ˜ˆ',
        'weather': '날씨',
        'world': '세계',
        'economy': '경제',
        'society': 'μ‚¬νšŒ',
        'it_science': 'IT/κ³Όν•™',
        'life_culture': 'μƒν™œ/λ¬Έν™”',
        'unknown': 'UNK'
    }

    # NOTE(review): the nesting of the two inner ifs and the trailing
    # ``pass`` statements is reconstructed from a view without indentation;
    # confirm against the original file.
    for key, name in domain_mapping.items():
        if f'acc_{key}' in accs:
            print(f" {name}:")
            print(f" 전체: {accs[f'acc_{key}']}% ({counts[f'acc_{key}']}개 μƒ˜ν”Œ)")
            if f'acc_test_{key}' in accs:
                print(f" ν…ŒμŠ€νŠΈ: {accs[f'acc_test_{key}']}% ({counts[f'acc_test_{key}']}개 μƒ˜ν”Œ)")
                pass
            if f'acc_dev_{key}' in accs:
                print(f" 개발: {accs[f'acc_dev_{key}']}% ({counts[f'acc_dev_{key}']}개 μƒ˜ν”Œ)")
                pass
            pass

    print("\n" + "="*80)
329
+
330
+
331
def main():
    """Script entry point: load a FreshQA CSV, compute accuracies, print them.

    The CSV path is taken from the first command-line argument and defaults
    to ``freshqa.csv``. Exits with status 1 when the file does not exist.
    """
    print("FreshQA 정확도 계산 슀크립트")
    print("="*50)

    # Optional first CLI argument overrides the default path
    csv_path = sys.argv[1] if len(sys.argv) > 1 else 'freshqa.csv'

    if not os.path.exists(csv_path):
        print(f"였λ₯˜: {csv_path} νŒŒμΌμ„ 찾을 수 μ—†μŠ΅λ‹ˆλ‹€.")
        print("μ‚¬μš©λ²•: python freshqa_acc.py [csv_file_path]")
        sys.exit(1)

    # Load -> compute -> report
    accs, counts = calculate_accuracy(load_freshqa_data(csv_path))
    print_results(accs, counts)

    # Also dump the raw accuracy dictionary
    print(f"\nπŸ“‹ λ”•μ…”λ„ˆλ¦¬ ν˜•νƒœ κ²°κ³Ό:")
    print(accs)


if __name__ == "__main__":
    main()
freshqa/merge_csv_with_model_response.py ADDED
@@ -0,0 +1,187 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import argparse
3
+ import os
4
+ import sys
5
+
6
+
7
def merge_dataframe_with_model_response_df(
    base_df: pd.DataFrame,
    model_response_csv_path: str,
    question_column: str = "question",
    model_response_column: str = "model_response"
) -> pd.DataFrame:
    """Attach model responses from a CSV file to a base DataFrame.

    Rows are matched on the question text after whitespace normalization
    (leading/trailing whitespace stripped, runs of whitespace collapsed to a
    single space; case is NOT changed). The base frame drives the result
    (left join), so base questions without a response get NaN.

    Args:
        base_df: Reference DataFrame; must contain ``question_column``.
        model_response_csv_path: Path of the CSV holding the responses.
        question_column: Name of the question column in both inputs.
        model_response_column: Name of the response column in the CSV.

    Returns:
        The base rows plus ``model_response_column``. When the base frame
        has a ``split`` column only ``split == 'TEST'`` rows are kept.
        Duplicate questions are reduced to their first occurrence.

    Raises:
        ValueError: If ``base_df`` is None/empty or a required column is
            missing.
        FileNotFoundError: If the response CSV does not exist.
    """
    if base_df is None or base_df.empty:
        raise ValueError("κΈ°μ€€ DataFrame이 λΉ„μ–΄μžˆμŠ΅λ‹ˆλ‹€.")

    if not os.path.exists(model_response_csv_path):
        raise FileNotFoundError(f"λͺ¨λΈ 응닡 CSV νŒŒμΌμ„ 찾을 수 μ—†μŠ΅λ‹ˆλ‹€: {model_response_csv_path}")

    try:
        model_df = pd.read_csv(model_response_csv_path)

        if question_column not in base_df.columns:
            raise ValueError(f"κΈ°μ€€ DataFrame에 '{question_column}' 컬럼이 μ—†μŠ΅λ‹ˆλ‹€.")

        if question_column not in model_df.columns:
            raise ValueError(f"λͺ¨λΈ 응닡 CSV νŒŒμΌμ— '{question_column}' 컬럼이 μ—†μŠ΅λ‹ˆλ‹€.")

        if model_response_column not in model_df.columns:
            raise ValueError(f"λͺ¨λΈ 응닡 CSV νŒŒμΌμ— '{model_response_column}' 컬럼이 μ—†μŠ΅λ‹ˆλ‹€.")

        match_key = 'question_normalized'

        def _normalize(questions):
            # Strip the ends and collapse inner whitespace runs.
            return questions.str.strip().str.replace(r'\s+', ' ', regex=True)

        base = base_df.copy()
        # A response column already present in the base frame would collide
        # with the incoming one (pandas would suffix both and the result
        # would have no usable response column), so the incoming responses
        # replace it.
        if model_response_column in base.columns:
            base = base.drop(columns=[model_response_column])
        base[match_key] = _normalize(base_df[question_column])

        # Only the match key and the response are taken from the CSV, and
        # the first response per question wins; this keeps the join
        # one-to-one and leaves the base question text untouched.
        responses = pd.DataFrame({
            match_key: _normalize(model_df[question_column]),
            model_response_column: model_df[model_response_column],
        }).drop_duplicates(subset=[match_key], keep='first')

        merged_df = base.merge(responses, on=match_key, how='left')
        merged_df = merged_df.drop(columns=[match_key])

        # Keep TEST questions only (DEV is excluded); frames without a
        # split column are passed through unfiltered.
        if 'split' in merged_df.columns:
            merged_df = merged_df[merged_df['split'] == 'TEST']

        # Duplicate questions in the base data: keep the first occurrence.
        merged_df = merged_df.drop_duplicates(subset=[question_column], keep='first')

        return merged_df

    except Exception as e:
        print(f"❌ 였λ₯˜ λ°œμƒ: {str(e)}")
        raise
128
+
129
def main():
    """Command-line entry point: merge a base CSV with a model-response CSV.

    Reads the base CSV, attaches the model responses via
    ``merge_dataframe_with_model_response_df`` and writes the merged frame
    to the output path. Exits with status 1 on any error.
    """
    parser = argparse.ArgumentParser(
        description="κΈ°μ€€ CSV 파일과 λͺ¨λΈ 응닡 CSV νŒŒμΌμ„ λ³‘ν•©ν•©λ‹ˆλ‹€.",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
μ‚¬μš© μ˜ˆμ‹œ:
python merge_csv_with_model_response.py base.csv model_response.csv output.csv
python merge_csv_with_model_response.py base.csv model_response.csv output.csv --question-col question --response-col model_response
"""
    )

    parser.add_argument(
        'base_csv',
        help='기쀀이 λ˜λŠ” CSV 파일 경둜'
    )

    parser.add_argument(
        'model_response_csv',
        help='λͺ¨λΈ 응닡이 ν¬ν•¨λœ CSV 파일 경둜'
    )

    parser.add_argument(
        'output_csv',
        help='좜λ ₯ν•  CSV 파일 경둜'
    )

    parser.add_argument(
        '--question-col',
        default='question',
        help='맀칭에 μ‚¬μš©ν•  질문 컬럼λͺ… (κΈ°λ³Έκ°’: question)'
    )

    parser.add_argument(
        '--response-col',
        default='model_response',
        help='λͺ¨λΈ 응닡 컬럼λͺ… (κΈ°λ³Έκ°’: model_response)'
    )

    args = parser.parse_args()

    try:
        # The merge function takes a DataFrame (not a path) for the base
        # data and returns the merged frame, so reading the input and
        # writing the output both happen here.
        base_df = pd.read_csv(args.base_csv)
        merged_df = merge_dataframe_with_model_response_df(
            base_df=base_df,
            model_response_csv_path=args.model_response_csv,
            question_column=args.question_col,
            model_response_column=args.response_col
        )
        merged_df.to_csv(args.output_csv, index=False)
    except Exception as e:
        print(f"μ‹€ν–‰ 쀑 였λ₯˜ λ°œμƒ: {e}")
        sys.exit(1)


if __name__ == "__main__":
    main()
requirements.txt ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Core dependencies for Hugging Face Spaces
2
+ gradio>=5.0.0
3
+ huggingface_hub<1.0.0
4
+ pandas>=2.0.0
5
+ numpy>=1.24.0
6
+ plotly>=5.0.0
7
+
8
+ # API and web processing
9
+ requests>=2.25.0
10
+ httpx>=0.24.0
11
+
12
+ # Date and time handling
13
+ pytz>=2023.3
14
+ python-dateutil>=2.8.0
15
+
16
+ # Data processing
17
+ openpyxl>=3.0.0
18
+ chardet>=5.0.0
19
+
20
+ # Progress and logging
21
+ tqdm>=4.65.0
22
+
23
+ # FreshQA evaluation
24
+ openai>=1.10.0
25
+ tabulate>=0.9.0
26
+
27
+ # Environment variables
28
+ python-dotenv>=1.0.0
29
+
30
+ # Optional: Korean language processing (commented out for faster deployment)
31
+ # konlpy>=0.6.0
32
+
33
+ # Optional: Heavy ML dependencies (commented out for faster deployment)
34
+ # torch>=2.0.0
35
+ # transformers>=4.30.0
36
+ # accelerate>=0.20.0
src/api_key_rotator.py ADDED
@@ -0,0 +1,78 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ API ν‚€ λ‘œν…Œμ΄ν„° λͺ¨λ“ˆ
3
+ λ©€ν‹°μŠ€λ ˆλ”© ν™˜κ²½μ—μ„œ μ—¬λŸ¬ API ν‚€λ₯Ό λΌμš΄λ“œλ‘œλΉˆ λ°©μ‹μœΌλ‘œ λΆ„λ°°ν•©λ‹ˆλ‹€.
4
+ """
5
+
6
+ import threading
7
+ from itertools import count
8
+ from typing import List
9
+
10
+ from config import Config
11
+
12
+
13
class ApiKeyRotator:
    """Hands out API keys in round-robin order; ``pick_key`` takes a lock."""

    def __init__(self, keys: List[str]):
        """
        Args:
            keys: API keys to rotate through (at least one).

        Raises:
            ValueError: If no key is supplied.
        """
        # Snapshot the keys: later changes to the caller's list must not be
        # able to empty the pool (which would make pick_key divide by zero),
        # and one-shot iterables are materialized before the emptiness check.
        self.keys = list(keys or [])
        if not self.keys:
            raise ValueError("API ν‚€ λ¦¬μŠ€νŠΈκ°€ λΉ„μ–΄μžˆμŠ΅λ‹ˆλ‹€. μ΅œμ†Œ 1개의 ν‚€κ°€ ν•„μš”ν•©λ‹ˆλ‹€.")

        self._counter = count()  # unbounded call counter
        self._lock = threading.Lock()

    def pick_key(self) -> str:
        """Return the next key in round-robin order.

        Example:
            >>> rotator = ApiKeyRotator(["key1", "key2", "key3"])
            >>> [rotator.pick_key() for _ in range(4)]
            ['key1', 'key2', 'key3', 'key1']
        """
        with self._lock:
            index = next(self._counter) % len(self.keys)
            return self.keys[index]
45
+
46
+
47
+ # μ „μ—­ μΈμŠ€ν„΄μŠ€ (싱글톀 νŒ¨ν„΄)
48
+ _rotator_instance: ApiKeyRotator = None
49
+ _instance_lock = threading.Lock()
50
+
51
+
52
def get_rotator() -> ApiKeyRotator:
    """Return the module-wide ApiKeyRotator, creating it on first use.

    The first call builds the rotator from ``Config.UPSTAGE_API_KEYS``;
    every later call returns that same instance.

    Raises:
        ValueError: If ``Config.UPSTAGE_API_KEYS`` is empty.
    """
    global _rotator_instance

    # Fast path: already created, no lock needed.
    if _rotator_instance is not None:
        return _rotator_instance

    with _instance_lock:
        # Re-check under the lock: another thread may have created the
        # instance while this one was waiting.
        if _rotator_instance is None:
            keys = Config.UPSTAGE_API_KEYS
            if not keys:
                raise ValueError(
                    "UPSTAGE_API_KEY λ˜λŠ” UPSTAGE_API_KEYS ν™˜κ²½ λ³€μˆ˜κ°€ μ„€μ •λ˜μ§€ μ•Šμ•˜μŠ΅λ‹ˆλ‹€. "
                    "μ΅œμ†Œ 1개의 API ν‚€κ°€ ν•„μš”ν•©λ‹ˆλ‹€."
                )
            _rotator_instance = ApiKeyRotator(keys)

    return _rotator_instance
78
+
src/hf_private_csv_loader.py ADDED
@@ -0,0 +1,279 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Hugging Face Private Repository CSV 파일 λ‘œλ”
3
+ HF_TOKEN을 μ΄μš©ν•˜μ—¬ private repositoryμ—μ„œ CSV νŒŒμΌμ„ μ•ˆμ „ν•˜κ²Œ λΆˆλŸ¬μ˜€λŠ” λͺ¨λ“ˆ
4
+ """
5
+
6
+ import os
7
+ import pandas as pd
8
+ import tempfile
9
+ from typing import Optional, Dict, Any, Union
10
+ from huggingface_hub import hf_hub_download, login, whoami
11
+
12
+
13
class HFPrivateCSVLoader:
    """Load CSV files from private Hugging Face Hub repositories using an access token."""

    def __init__(self, token: Optional[str] = None):
        """
        Resolve the access token and log in to the Hugging Face Hub.

        Args:
            token: Hugging Face API token. When omitted, the ``HF_TOKEN`` and
                then the ``HUGGINGFACE_HUB_TOKEN`` environment variables are used.

        Raises:
            ValueError: If no token was passed and neither variable is set.
            Exception: Any error raised by ``login`` is reported and re-raised.
        """
        self.token = token or os.getenv('HF_TOKEN') or os.getenv('HUGGINGFACE_HUB_TOKEN')

        if not self.token:
            raise ValueError(
                "Hugging Face 토큰이 필요합니다. "
                "토큰을 직접 전달하거나 HF_TOKEN 또는 HUGGINGFACE_HUB_TOKEN 환경변수를 설정하세요."
            )

        # Log in with the resolved token; failures are printed, then propagated.
        try:
            login(token=self.token)
            print("✅ Hugging Face에 성공적으로 로그인되었습니다.")
        except Exception as e:
            print(f"❌ Hugging Face 로그인 실패: {e}")
            raise

    def check_auth(self) -> Dict[str, Any]:
        """
        Report whether this loader's token is accepted by the Hub.

        Returns:
            ``{"authenticated": True, "user", "type", "id"}`` on success, or
            ``{"authenticated": False, "error": <message>}`` on any failure.
        """
        try:
            # Ask about *this* loader's token rather than whichever token
            # happens to be stored globally on the machine.
            user_info = whoami(token=self.token)
            return {
                "authenticated": True,
                "user": user_info.get("name", "Unknown"),
                "type": user_info.get("type", "Unknown"),
                "id": user_info.get("id", "Unknown")
            }
        except Exception as e:
            return {
                "authenticated": False,
                "error": str(e)
            }

    def load_csv_from_private_repo(self,
                                   repo_id: str,
                                   filename: str,
                                   repo_type: str = "dataset",
                                   **kwargs) -> Optional[pd.DataFrame]:
        """
        Download one CSV file from a private repository and parse it.

        Args:
            repo_id: Repository ID (e.g. "username/repo-name").
            filename: CSV file name; may include a path inside the repository.
            repo_type: Repository type ("dataset", "model" or "space").
            **kwargs: Extra arguments for ``pandas.read_csv``; they override
                the defaults ``encoding='utf-8'`` and ``low_memory=False``.

        Returns:
            The parsed DataFrame, or None if download or parsing failed (the
            error is printed, not raised).
        """
        try:
            print(f"📥 Private repository에서 CSV 파일 로드 시작: {repo_id}/{filename}")

            # Download into a throw-away directory; the DataFrame is fully
            # materialised before the directory is removed.
            with tempfile.TemporaryDirectory() as temp_dir:
                file_path = hf_hub_download(
                    repo_id=repo_id,
                    filename=filename,
                    local_dir=temp_dir,
                    repo_type=repo_type,
                    token=self.token  # pass the token explicitly
                )

                read_options = {
                    'encoding': 'utf-8',
                    'low_memory': False
                }
                read_options.update(kwargs)

                return pd.read_csv(file_path, **read_options)

        except Exception as e:
            print(f"❌ CSV 파일 로드 실패: {e}")
            return None

    def load_multiple_csvs(self,
                           repo_id: str,
                           filenames: list,
                           repo_type: str = "dataset",
                           **kwargs) -> Dict[str, Optional[pd.DataFrame]]:
        """
        Load several CSV files from the same repository.

        Args:
            repo_id: Repository ID.
            filenames: CSV file names to load.
            repo_type: Repository type.
            **kwargs: Extra arguments for ``pandas.read_csv``.

        Returns:
            ``{filename: DataFrame}``; files that failed to load map to None.
        """
        results = {}

        for filename in filenames:
            df = self.load_csv_from_private_repo(repo_id, filename, repo_type, **kwargs)
            results[filename] = df

            if df is None:
                print(f"⚠️ {filename} 로드 실패")

        return results

    def get_csv_info(self,
                     repo_id: str,
                     filename: str,
                     repo_type: str = "dataset") -> Optional[Dict[str, Any]]:
        """
        Summarise a CSV file stored in a repository.

        Note that the file is downloaded and fully parsed to compute the
        summary; only the DataFrame itself is not returned.

        Args:
            repo_id: Repository ID.
            filename: CSV file name.
            repo_type: Repository type.

        Returns:
            A dict with ``filename``, ``rows``, ``columns``, ``column_names``,
            ``dtypes``, ``memory_usage``, ``has_nulls`` and ``null_counts``,
            or None if the file could not be loaded or summarised.
        """
        try:
            df = self.load_csv_from_private_repo(repo_id, filename, repo_type)

            if df is None:
                return None

            return {
                "filename": filename,
                "rows": len(df),
                "columns": len(df.columns),
                "column_names": df.columns.tolist(),
                "dtypes": df.dtypes.to_dict(),
                "memory_usage": df.memory_usage(deep=True).sum(),
                "has_nulls": df.isnull().any().any(),
                "null_counts": df.isnull().sum().to_dict()
            }

        except Exception as e:
            print(f"❌ CSV 파일 정보 조회 실패: {e}")
            return None
169
+
170
+
171
def load_csv_with_token(repo_id: str,
                        filename: str,
                        token: str,
                        repo_type: str = "dataset",
                        **kwargs) -> Optional[pd.DataFrame]:
    """
    Convenience wrapper: load a CSV file using an explicitly supplied token.

    Args:
        repo_id: Repository ID.
        filename: CSV file name.
        token: Hugging Face API token.
        repo_type: Repository type.
        **kwargs: Extra arguments for ``pandas.read_csv``.

    Returns:
        The parsed DataFrame, or None on any failure (including a failed
        login); the error is printed rather than raised.
    """
    try:
        loader = HFPrivateCSVLoader(token=token)
        return loader.load_csv_from_private_repo(repo_id, filename, repo_type, **kwargs)
    except Exception as e:
        print(f"❌ CSV 로드 실패: {e}")
        return None
195
+
196
+
197
def load_csv_with_env_token(repo_id: str,
                            filename: str,
                            repo_type: str = "dataset",
                            **kwargs) -> Optional[pd.DataFrame]:
    """
    Convenience wrapper: load a CSV file using the token from the environment.

    Args:
        repo_id: Repository ID.
        filename: CSV file name.
        repo_type: Repository type.
        **kwargs: Extra arguments for ``pandas.read_csv``.

    Returns:
        The parsed DataFrame, or None on any failure (including a missing
        token); the error is printed rather than raised.
    """
    try:
        loader = HFPrivateCSVLoader()  # token is resolved from the environment
        return loader.load_csv_from_private_repo(repo_id, filename, repo_type, **kwargs)
    except Exception as e:
        print(f"❌ CSV 로드 실패: {e}")
        return None
219
+
220
+
221
+ # μ‚¬μš© μ˜ˆμ‹œ
222
if __name__ == "__main__":
    # Example 1: pass the token explicitly.
    print("=== 예시 1: 토큰 직접 전달 ===")
    token = "your_hf_token_here"  # replace with a real token

    try:
        df = load_csv_with_token(
            repo_id="username/private-dataset",
            filename="data.csv",
            token=token,
            repo_type="dataset"
        )

        if df is not None:
            print(f"✅ CSV 로드 성공: {len(df)} 행, {len(df.columns)} 열")
            print(f"컬럼: {list(df.columns)}")
        else:
            print("❌ CSV 로드 실패")
    except Exception as e:
        print(f"❌ 오류: {e}")

    # Example 2: take the token from the environment.
    print("\n=== 예시 2: 환경변수 사용 ===")
    try:
        df = load_csv_with_env_token(
            repo_id="username/private-dataset",
            filename="data.csv",
            repo_type="dataset"
        )

        if df is not None:
            print(f"✅ CSV 로드 성공: {len(df)} 행, {len(df.columns)} 열")
        else:
            print("❌ CSV 로드 실패")
    except Exception as e:
        print(f"❌ 오류: {e}")

    # Example 3: use the loader class directly.
    print("\n=== 예시 3: 클래스 사용 ===")
    try:
        loader = HFPrivateCSVLoader(token=token)

        # Check the authentication state.
        auth_status = loader.check_auth()
        print(f"인증 상태: {auth_status}")

        # Inspect the CSV file.
        csv_info = loader.get_csv_info("username/private-dataset", "data.csv")
        if csv_info:
            print(f"CSV 파일 정보: {csv_info}")

        # Load the CSV file.
        df = loader.load_csv_from_private_repo("username/private-dataset", "data.csv")
        if df is not None:
            print(f"✅ CSV 로드 성공: {len(df)} 행, {len(df.columns)} 열")

    except Exception as e:
        print(f"❌ 오류: {e}")
src/leaderboard_manager.py ADDED
@@ -0,0 +1,215 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ λ¦¬λ”λ³΄λ“œ 관리 λͺ¨λ“ˆ
3
+ λ¦¬λ”λ³΄λ“œ λ°μ΄ν„°μ˜ λ‘œλ“œ, μ €μž₯, ν‘œμ‹œ μ€€λΉ„λ₯Ό λ‹΄λ‹Ήν•©λ‹ˆλ‹€.
4
+ """
5
+
6
+ import pandas as pd
7
+ import os
8
+ from src.utils import file_lock
9
+
10
def load_leaderboard_data():
    """
    Load the persisted leaderboard as a DataFrame.

    Reads ``<project root>/data/leaderboard_results.csv`` (the project root is
    the parent of this module's directory), back-fills columns that older
    result files lack, sorts by ``accuracy`` in descending order and returns
    the known columns in canonical storage order. ``rank`` is not stored (it
    is derived at display time) and numeric values are returned unrounded.

    Returns:
        The leaderboard DataFrame. If the CSV file is missing or completely
        empty, an empty DataFrame carrying the canonical columns.
    """
    # Breakdown score columns; older CSVs may lack them, in which case 0.0 is used.
    score_columns = [
        'acc_vp', 'acc_fp',
        'acc_vp_one_hop', 'acc_vp_two_hop', 'acc_fp_one_hop', 'acc_fp_two_hop',
        'acc_vp_old', 'acc_vp_new', 'acc_fp_old', 'acc_fp_new',
        'acc_politics', 'acc_sports', 'acc_entertainment', 'acc_weather',
        'acc_world', 'acc_economy', 'acc_society', 'acc_it_science',
        'acc_life_culture', 'acc_unknown',
    ]
    # Canonical column order (rank excluded). Columns outside this list are
    # dropped from the result, so there is no point back-filling any others.
    column_order = [
        'id', 'model', 'description', 'accuracy', 'fast_changing_accuracy',
        'slow_changing_accuracy', 'never_changing_accuracy',
    ] + score_columns + ['total_questions', 'evaluation_date', 'evaluation_mode']

    current_dir = os.path.dirname(os.path.abspath(__file__))  # src/ directory
    project_root = os.path.dirname(current_dir)
    data_path = os.path.join(project_root, 'data', 'leaderboard_results.csv')

    try:
        df = pd.read_csv(data_path)
    except (FileNotFoundError, pd.errors.EmptyDataError):
        # No leaderboard yet (or a zero-byte file): start from the empty schema.
        return pd.DataFrame({col: [] for col in column_order})

    if 'evaluation_mode' not in df.columns:
        df['evaluation_mode'] = 'Unknown'

    for col in ('model', 'description'):
        if col not in df.columns:
            df[col] = pd.Series(dtype='object')

    for col in score_columns:
        if col not in df.columns:
            df[col] = 0.0

    # accuracy is the ranking key; skip sorting for an empty frame.
    if not df.empty and 'accuracy' in df.columns:
        df = df.sort_values('accuracy', ascending=False).reset_index(drop=True)

    # Keep only known columns that exist, in canonical order.
    return df[[col for col in column_order if col in df.columns]]
107
+
108
def append_to_leaderboard_data(new_data_list):
    """
    Append new result rows to the leaderboard CSV and return the updated table.

    The read -> modify -> write cycle runs inside ``file_lock`` on
    ``<csv path>.lock`` (presumably to serialise concurrent writers; see
    ``src.utils.file_lock``).

    Args:
        new_data_list: Data accepted by ``pd.DataFrame(...)``, e.g. a list of
            row dicts. Keys outside the canonical column list are not persisted.

    Returns:
        The combined DataFrame, sorted by ``accuracy`` (descending) and
        restricted to the canonical columns, exactly as written to disk.
    """
    desired_order = [
        'id', 'model', 'description', 'accuracy', 'fast_changing_accuracy',
        'slow_changing_accuracy', 'never_changing_accuracy', 'acc_vp', 'acc_fp',
        'acc_vp_one_hop', 'acc_vp_two_hop', 'acc_fp_one_hop', 'acc_fp_two_hop',
        'acc_vp_old', 'acc_vp_new', 'acc_fp_old', 'acc_fp_new',
        'acc_politics', 'acc_sports', 'acc_entertainment', 'acc_weather',
        'acc_world', 'acc_economy', 'acc_society', 'acc_it_science',
        'acc_life_culture', 'acc_unknown', 'total_questions', 'evaluation_date', 'evaluation_mode'
    ]

    current_dir = os.path.dirname(os.path.abspath(__file__))  # src/ directory
    project_root = os.path.dirname(current_dir)
    data_dir = os.path.join(project_root, 'data')
    data_path = os.path.join(data_dir, 'leaderboard_results.csv')

    # On a fresh checkout data/ may not exist yet; without it neither the
    # lock file nor the CSV can be created.
    os.makedirs(data_dir, exist_ok=True)

    with file_lock(data_path + '.lock'):
        try:
            existing_df = pd.read_csv(data_path)
        except (FileNotFoundError, pd.errors.EmptyDataError):
            # No (usable) leaderboard yet: start from the empty canonical schema.
            existing_df = pd.DataFrame({col: [] for col in desired_order})
        else:
            for col in ['model', 'description']:
                if col not in existing_df.columns:
                    existing_df[col] = pd.Series(dtype='object')

        new_df = pd.DataFrame(new_data_list)
        combined_df = pd.concat([existing_df, new_df], ignore_index=True)

        if not combined_df.empty and 'accuracy' in combined_df.columns:
            combined_df = combined_df.sort_values('accuracy', ascending=False).reset_index(drop=True)

        combined_df = combined_df.reindex(
            columns=[col for col in desired_order if col in combined_df.columns]
        )

        combined_df.to_csv(data_path, index=False)

    return combined_df
150
+
151
def prepare_display_data(df, global_ranking=None):
    """
    Build the table shown to users: add ``rank``, fill blanks, round scores.

    Args:
        df: Leaderboard data. An empty frame is returned unchanged.
        global_ranking: Optional mapping (anything ``Index.map`` accepts) from
            a row's index label to its rank value. When given, rows keep their
            current order and ``rank`` is taken from the mapping as-is. When
            omitted, rows are sorted by ``accuracy`` (descending) and ranked
            1..n, with medal icons for the first three places.

    Returns:
        A new DataFrame with ``rank`` first (only when an ``accuracy`` column
        exists), followed by the known columns in display order.
    """
    if df.empty:
        return df

    display_df = df.copy()
    if 'model' in display_df.columns:
        # Both missing and blank model names are shown as "Anonymous Model".
        display_df['model'] = (
            display_df['model'].fillna('Anonymous Model').replace('', 'Anonymous Model')
        )
    if 'description' in display_df.columns:
        display_df['description'] = display_df['description'].fillna('')

    # NOTE(review): SOURCE lost its indentation; the local ranking below is
    # assumed to belong to the `else` branch, otherwise a supplied
    # global_ranking would always be overwritten. Confirm against the repo.
    if 'accuracy' in display_df.columns:
        if global_ranking is not None:
            display_df['rank'] = display_df.index.map(global_ranking)
        else:
            display_df = display_df.sort_values('accuracy', ascending=False).reset_index(drop=True)
            medals = {1: "🥇", 2: "🥈", 3: "🥉"}
            display_df['rank'] = [
                medals.get(position, str(position))
                for position in range(1, len(display_df) + 1)
            ]

    # Round scores to two decimals for display only; stored data is untouched.
    numeric_columns = [
        'accuracy', 'fast_changing_accuracy', 'slow_changing_accuracy', 'never_changing_accuracy',
        'acc_vp', 'acc_fp', 'acc_vp_one_hop', 'acc_vp_two_hop', 'acc_fp_one_hop', 'acc_fp_two_hop',
        'acc_vp_old', 'acc_vp_new', 'acc_fp_old', 'acc_fp_new',
        'acc_politics', 'acc_sports', 'acc_entertainment', 'acc_weather',
        'acc_world', 'acc_economy', 'acc_society', 'acc_it_science',
        'acc_life_culture', 'acc_unknown'
    ]
    for col in numeric_columns:
        # Guard on dtype: rounding an object-typed column would raise.
        if col in display_df.columns and pd.api.types.is_numeric_dtype(display_df[col]):
            display_df[col] = display_df[col].round(2)

    # Display order, rank first; columns missing from the data are skipped.
    column_order = [
        'rank', 'id', 'model', 'description', 'accuracy', 'fast_changing_accuracy',
        'slow_changing_accuracy', 'never_changing_accuracy', 'acc_vp', 'acc_fp',
        'acc_vp_one_hop', 'acc_vp_two_hop', 'acc_fp_one_hop', 'acc_fp_two_hop',
        'acc_vp_old', 'acc_vp_new', 'acc_fp_old', 'acc_fp_new',
        'acc_politics', 'acc_sports', 'acc_entertainment', 'acc_weather',
        'acc_world', 'acc_economy', 'acc_society', 'acc_it_science',
        'acc_life_culture', 'acc_unknown', 'total_questions', 'evaluation_date', 'evaluation_mode'
    ]
    return display_df[[col for col in column_order if col in display_df.columns]]
src/quick_csv_loader.py ADDED
@@ -0,0 +1,158 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ λΉ λ₯Έ CSV λ‘œλ” - κ°„λ‹¨ν•œ μ‚¬μš©μ„ μœ„ν•œ 편의 ν•¨μˆ˜λ“€
3
+ HF_TOKEN을 μ΄μš©ν•˜μ—¬ private repositoryμ—μ„œ CSV νŒŒμΌμ„ λΉ λ₯΄κ²Œ λ‘œλ“œν•©λ‹ˆλ‹€.
4
+ """
5
+
6
+ import os
7
+ import pandas as pd
8
+ from src.hf_private_csv_loader import HFPrivateCSVLoader
9
+
10
+
11
def quick_load_csv(repo_id: str, filename: str, token: str = None) -> pd.DataFrame:
    """
    Load a CSV file from a Hugging Face repository in one call.

    Args:
        repo_id: Repository ID (e.g. "username/repo-name").
        filename: CSV file name.
        token: Hugging Face token; passed straight to ``HFPrivateCSVLoader``
            (None lets the loader resolve it itself).

    Returns:
        The parsed DataFrame (never None).

    Raises:
        RuntimeError: If the loader reports a failure by returning None.
            RuntimeError derives from Exception, so existing
            ``except Exception`` handlers keep working.
        Exception: Whatever constructing ``HFPrivateCSVLoader`` raises.
    """
    loader = HFPrivateCSVLoader(token=token)
    df = loader.load_csv_from_private_repo(repo_id, filename)

    if df is None:
        raise RuntimeError(f"CSV 파일 로드 실패: {repo_id}/{filename}")

    return df
33
+
34
+
35
def load_csv_with_env_token(repo_id: str, filename: str) -> pd.DataFrame:
    """
    Load a CSV file without supplying a token explicitly.

    Thin alias for ``quick_load_csv`` with no token, which leaves token
    resolution to the loader.

    Args:
        repo_id: Repository ID.
        filename: CSV file name.

    Returns:
        The parsed DataFrame.

    Raises:
        Exception: If loading fails.
    """
    no_token = None
    return quick_load_csv(repo_id, filename, token=no_token)
50
+
51
+
52
def load_freshqa_results(repo_id: str, filename: str = "results.csv", token: str = None) -> pd.DataFrame:
    """
    Load a FreshQA evaluation-results CSV file.

    No schema validation is performed: the check for ``id``, ``accuracy`` and
    ``evaluation_date`` only fed a warning that had been commented out, so
    the unused computation was removed. Callers that need those columns must
    verify them themselves.

    Args:
        repo_id: Repository ID.
        filename: Results file name (default "results.csv").
        token: Hugging Face token.

    Returns:
        The parsed DataFrame.

    Raises:
        Exception: If loading fails (propagated from ``quick_load_csv``).
    """
    return quick_load_csv(repo_id, filename, token)
74
+
75
+
76
def merge_with_leaderboard(new_results_df: pd.DataFrame,
                           leaderboard_path: str = "data/leaderboard_results.csv") -> pd.DataFrame:
    """
    Merge new results into the leaderboard CSV and write it back.

    Rows sharing the same (``id``, ``evaluation_date``) pair are de-duplicated,
    keeping the newest one, and the result is sorted by ``accuracy`` in
    descending order when those columns exist.

    Args:
        new_results_df: New result rows.
        leaderboard_path: Path of the leaderboard CSV file.

    Returns:
        The merged DataFrame as written to disk. If there was no usable
        leaderboard file yet, ``new_results_df`` itself.
    """
    try:
        existing_df = pd.read_csv(leaderboard_path)
    except (FileNotFoundError, pd.errors.EmptyDataError):
        # No leaderboard yet (or a zero-byte file): create it from the new
        # rows, making sure the target directory exists first.
        parent_dir = os.path.dirname(leaderboard_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)
        new_results_df.to_csv(leaderboard_path, index=False)
        return new_results_df

    merged_df = pd.concat([existing_df, new_results_df], ignore_index=True)

    # New rows come last, so keep='last' lets a re-submission replace the
    # stored row with the same id and evaluation date.
    if 'id' in merged_df.columns and 'evaluation_date' in merged_df.columns:
        merged_df = merged_df.drop_duplicates(
            subset=['id', 'evaluation_date'],
            keep='last'
        )

    if 'accuracy' in merged_df.columns:
        merged_df = merged_df.sort_values('accuracy', ascending=False)

    merged_df.to_csv(leaderboard_path, index=False)

    return merged_df
115
+
116
+
117
+ # μ‚¬μš© μ˜ˆμ‹œ
118
if __name__ == "__main__":
    # Example 1: simplest usage.
    try:
        quick_load_csv(
            repo_id="username/private-dataset",
            filename="data.csv",
            token="your_token_here"  # replace with a real token
        )
    except Exception as e:
        print(f"❌ 오류: {e}")

    # Example 2: take the token from the environment.
    try:
        load_csv_with_env_token(
            repo_id="username/private-dataset",
            filename="data.csv"
        )
    except Exception as e:
        print(f"❌ 오류: {e}")

    # Example 3: load FreshQA results and merge them into the leaderboard.
    try:
        results_df = load_freshqa_results(
            repo_id="user/freshqa-results",
            filename="evaluation_results.csv",
            token="your_token_here"  # replace with a real token
        )

        merge_with_leaderboard(results_df)

    except Exception as e:
        print(f"❌ 오류: {e}")
src/submission_handler.py ADDED
@@ -0,0 +1,615 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import os
4
+ import time
5
+ import queue
6
+ from dataclasses import dataclass
7
+ from typing import Any, Optional, Dict, Tuple, Callable
8
+ import pandas as pd
9
+ import gradio as gr
10
+
11
+ from config import Config
12
+ from src.submission_tracker import get_submission_tracker, SubmissionTracker
13
+ from src.quick_csv_loader import quick_load_csv
14
+ from src.leaderboard_manager import append_to_leaderboard_data
15
+ from src.utils import get_current_datetime_str
16
+ from freshqa.fresheval_parallel import evaluate_dataframe_parallel
17
+ from freshqa.freshqa_acc import process_freshqa_dataframe, calculate_accuracy
18
+ from freshqa.merge_csv_with_model_response import merge_dataframe_with_model_response_df
19
+
20
+
21
+ # -------------------------
22
+ # 곡톡 λ°˜ν™˜ν˜•(Result)
23
+ # -------------------------
24
@dataclass
class Result:
    """Uniform return value of the handler's internal steps."""

    # True when the step succeeded.
    ok: bool
    # Payload produced by the step, if any.
    data: Optional[Any] = None
    # Human-readable error message when the step failed.
    error: Optional[str] = None
    # Optional extra information about the step.
    meta: Optional[Dict] = None
30
+
31
+
32
+ # -------------------------
33
+ # 핡심 ν•Έλ“€λŸ¬
34
+ # -------------------------
35
+ class SubmissionHandler:
36
+ """
37
+ 제좜 파일 처리 및 FreshQA 평가 μ˜€μΌ€μŠ€νŠΈλ ˆμ΄μ…˜.
38
+ - Tracker/Config μ˜μ‘΄μ„± μ£Όμž…
39
+ - λ‚΄λΆ€ helperλŠ” Result/λͺ…ν™•ν•œ νƒ€μž… λ°˜ν™˜
40
+ - μ‹€μ œ μ €μž₯/ν•œλ„/μ‚¬μš©μž IDλŠ” trackerκ°€ 처리(ν•Έλ“€λŸ¬λŠ” 호좜만)
41
+ """
42
+
43
+ def __init__(self, tracker: Optional[SubmissionTracker] = None, cfg: Optional[type] = None):
44
+ # Dependency Injection
45
+ self.tracker = tracker
46
+ self.cfg = cfg or Config
47
+
48
+ # κΈ°μ‘΄ μ½”λ“œμ™€ ν˜Έν™˜λ˜λŠ” 속성 (Config 직접 μ°Έμ‘° 제거)
49
+ self.enable_limit = getattr(self.cfg, "ENABLE_SUBMISSION_LIMIT", False)
50
+ self.repo_id = getattr(self.cfg, "FRESHQA_DATA_REPO_ID", None)
51
+ self.filename = getattr(self.cfg, "FRESHQA_DATA_FILENAME", None)
52
+ self.hf_token = getattr(self.cfg, "HF_TOKEN", None)
53
+
54
+ # ν•„μˆ˜ μ„€μ • 점검
55
+ if not self.repo_id:
56
+ raise ValueError("❌ FRESHQA_DATA_REPO_ID ν™˜κ²½ λ³€μˆ˜κ°€ μ„€μ •λ˜μ§€ μ•Šμ•˜μŠ΅λ‹ˆλ‹€.")
57
+ if not self.filename:
58
+ raise ValueError("❌ FRESHQA_DATA_FILENAME ν™˜κ²½ λ³€μˆ˜κ°€ μ„€μ •λ˜μ§€ μ•Šμ•˜μŠ΅λ‹ˆλ‹€.")
59
+ if not self.hf_token:
60
+ raise ValueError("❌ HF_TOKEN ν™˜κ²½ λ³€μˆ˜κ°€ μ„€μ •λ˜μ§€ μ•Šμ•˜μŠ΅λ‹ˆλ‹€.")
61
+
62
+ # --------- 1) 제좜 파일 검증 ----------
63
+ def _validate_submission_file(self, file) -> Result:
64
+ if file is None:
65
+ return Result(ok=False, error="❌ CSV νŒŒμΌμ„ μ—…λ‘œλ“œν•΄μ£Όμ„Έμš”.")
66
+ try:
67
+ df = pd.read_csv(file.name)
68
+ except Exception as e:
69
+ return Result(ok=False, error=f"❌ CSV λ‘œλ”© μ‹€νŒ¨: {e}")
70
+
71
+ required_columns = ["question", "model_response"]
72
+ for col in required_columns:
73
+ if col not in df.columns:
74
+ return Result(ok=False, error=f"❌ CSV 파일의 μ»¬λŸΌμ— '{col}'이(κ°€) μ—†μŠ΅λ‹ˆλ‹€.")
75
+ if len(df) == 0:
76
+ return Result(ok=False, error="❌ CSV νŒŒμΌμ— 데이터가 μ—†μŠ΅λ‹ˆλ‹€.")
77
+ if df["question"].isnull().any() or df["model_response"].isnull().any():
78
+ return Result(ok=False, error="❌ 'question' λ˜λŠ” 'model_response' μ»¬λŸΌμ— λˆ„λ½λœ 값이 μžˆμŠ΅λ‹ˆλ‹€.")
79
+
80
+ return Result(ok=True)
81
+
82
+ # --------- 2) λΉ λ₯Έ λ‘œλ”© ----------
83
+ def _load_submission_df(self, file) -> Result:
84
+ try:
85
+ df = quick_load_csv(self.repo_id, self.filename, self.hf_token)
86
+ except Exception as e:
87
+ return Result(ok=False, error=f"❌ CSV λ‘œλ”© μ‹€νŒ¨: {e}")
88
+ return Result(ok=True, data=df)
89
+
90
+ # --------- 3) 병합 ----------
91
+ def _merge_with_base(self, submission_df: pd.DataFrame, file_name: str) -> Result:
92
+ try:
93
+ merged_df = merge_dataframe_with_model_response_df(submission_df, file_name)
94
+ return Result(ok=True, data=merged_df)
95
+ except Exception as e:
96
+ return Result(ok=False, error=f"❌ κΈ°μ€€ 데이터와 병합 μ‹€νŒ¨: {e}")
97
+
98
+ # --------- 4) 평가 ----------
99
+ def _evaluate_freshqa(
100
+ self,
101
+ merged_df: pd.DataFrame,
102
+ on_progress: Optional[Callable[[int, int, str], None]] = None,
103
+ ) -> Result:
104
+ """Relaxed/Strict λ™μ‹œ μ‹€ν–‰ + 큐 기반 μ§„ν–‰λ₯  κ°±μ‹ """
105
+ q: "queue.Queue[Tuple[int, int, str]]" = queue.Queue()
106
+
107
+ # 두 λͺ¨λ“œ(Relaxed, Strict)λ₯Ό λ³‘λ ¬λ‘œ μ²˜λ¦¬ν•˜λ―€λ‘œ 총 μ§„ν–‰ λ‹¨μœ„λŠ” 2λ°°
108
+ total_items = len(merged_df) * 2
109
+ done_count = 0
110
+
111
+ def _drain_queue(block: bool = False):
112
+ nonlocal done_count
113
+ while True:
114
+ try:
115
+ item = q.get(block=block, timeout=0.05 if block else 0)
116
+ except Exception:
117
+ break
118
+ try:
119
+ # μ΅œμ‹  컀밋 κΈ°μ€€: progress_queueμ—λŠ” 1μ”© μ¦κ°€ν•˜λŠ” μ •μˆ˜λ§Œ λ“€μ–΄μ˜΅λ‹ˆλ‹€.
120
+ if isinstance(item, int):
121
+ done_count += item
122
+ if on_progress:
123
+ remaining = max(total_items - done_count, 0)
124
+ desc_text = f"평가 쀑... {done_count}/{total_items}"
125
+ on_progress(done_count, total_items, desc_text)
126
+ # ν˜Ήμ‹œ κ³Όκ±° 포맷(tuple)이 λ“€μ–΄μ˜€λ”λΌλ„ λ°©μ–΄μ μœΌλ‘œ 처리
127
+ elif isinstance(item, tuple) and len(item) == 3 and on_progress:
128
+ on_progress(item[0], item[1], item[2])
129
+ finally:
130
+ q.task_done()
131
+
132
+ from concurrent.futures import ThreadPoolExecutor
133
+
134
+ try:
135
+ with ThreadPoolExecutor(max_workers=2) as ex:
136
+ relaxed_f = ex.submit(
137
+ evaluate_dataframe_parallel,
138
+ df=merged_df,
139
+ mode="Relaxed",
140
+ on_item_done=None,
141
+ progress_queue=q,
142
+ )
143
+ strict_f = ex.submit(
144
+ evaluate_dataframe_parallel,
145
+ df=merged_df,
146
+ mode="Strict",
147
+ on_item_done=None,
148
+ progress_queue=q,
149
+ )
150
+
151
+ while True:
152
+ _drain_queue(block=False)
153
+ if relaxed_f.done() and strict_f.done():
154
+ break
155
+ time.sleep(0.05)
156
+
157
+ _drain_queue(block=True)
158
+
159
+ relaxed = relaxed_f.result()
160
+ strict = strict_f.result()
161
+
162
+ return Result(ok=True, data=(relaxed, strict))
163
+ except Exception as e:
164
+ return Result(ok=False, error=f"❌ 평가 쀑 였λ₯˜ λ°œμƒ: {e}")
165
+
166
+ # --------- 5) 정확도 계산 ----------
167
+ def _calculate_accuracy(self, fresheval_df: pd.DataFrame) -> Result:
168
+ try:
169
+ processed = process_freshqa_dataframe(fresheval_df)
170
+ accs, counts = calculate_accuracy(processed)
171
+ return Result(ok=True, data=(processed, accs, counts))
172
+ except Exception as e:
173
+ return Result(ok=False, error=f"❌ κ²°κ³Ό 집계 쀑 였λ₯˜κ°€ λ°œμƒν–ˆμŠ΅λ‹ˆλ‹€: {e}")
174
+
175
+ # --------- 6) μš”μ•½ ----------
176
+ def _build_summary(self, name: str, relaxed_accs: dict, strict_accs: dict) -> str:
177
+ """
178
+ result_summaryκ°€ κΈ°λŒ€ν•˜λŠ” 이전 λ¬Έμžμ—΄ 포맷을 κ·ΈλŒ€λ‘œ μœ μ§€ν•©λ‹ˆλ‹€.
179
+ - 헀더/μ„Ήμ…˜ 제λͺ©/μ€„λ°”κΏˆ/ν‘œν˜„(μ†Œμˆ˜μ  1자리) 동일
180
+ - ν…ŒμŠ€νŠΈμ…‹ κΈ°μ€€ μ§€ν‘œ: acc_test, *_fast_changing, *_two_hop, *_old, *_new, *_vp, *_fp
181
+ """
182
+ submitter = name if name else "(이름 λ―Έμž…λ ₯)"
183
+
184
+ lines = []
185
+ lines.append(f"**제좜자**: {submitter}")
186
+ lines.append("")
187
+ lines.append("**정확도 (ν…ŒμŠ€νŠΈμ…‹ κΈ°μ€€)**")
188
+ lines.append(f"- Relaxed: {relaxed_accs.get('acc_test', 0):.1f}%")
189
+ lines.append(f"- Strict: {strict_accs.get('acc_test', 0):.1f}%")
190
+ lines.append("")
191
+ lines.append("**μ„ΈλΆ€ μ§€ν‘œ (ν…ŒμŠ€νŠΈμ…‹)**")
192
+ lines.append(
193
+ f"- Fast Changing: R {relaxed_accs.get('acc_test_fast_changing', 0):.1f}% / "
194
+ f"S {strict_accs.get('acc_test_fast_changing', 0):.1f}%"
195
+ )
196
+ lines.append(
197
+ f"- Two-hop: R {relaxed_accs.get('acc_test_two_hop', 0):.1f}% / "
198
+ f"S {strict_accs.get('acc_test_two_hop', 0):.1f}%"
199
+ )
200
+ lines.append(
201
+ f"- Old: R {relaxed_accs.get('acc_test_old', 0):.1f}% / "
202
+ f"S {strict_accs.get('acc_test_old', 0):.1f}%"
203
+ )
204
+ lines.append(
205
+ f"- New: R {relaxed_accs.get('acc_test_new', 0):.1f}% / "
206
+ f"S {strict_accs.get('acc_test_new', 0):.1f}%"
207
+ )
208
+ lines.append(
209
+ f"- VP: R {relaxed_accs.get('acc_test_vp', 0):.1f}% / "
210
+ f"S {strict_accs.get('acc_test_vp', 0):.1f}%"
211
+ )
212
+ lines.append(
213
+ f"- FP: R {relaxed_accs.get('acc_test_fp', 0):.1f}% / "
214
+ f"S {strict_accs.get('acc_test_fp', 0):.1f}%"
215
+ )
216
+ return "\n".join(lines)
217
+
218
def _get_result_summary(
    self,
    file_name: str,
    name: str,
    relaxed_accs: dict,
    strict_accs: dict,
    relaxed_table: pd.DataFrame,
    strict_table: pd.DataFrame,
) -> str:
    """
    Compose the plain-text report shown to the user after a successful run.

    The report has four parts: basic info (file name, evaluator), a result
    digest per mode (overall / fact-type / false-premise accuracy), the
    submission metadata (submitter, timestamp, note), and the two detailed
    tables rendered with ``DataFrame.to_string(index=False)``.

    Args:
        file_name: Path of the uploaded file; only its basename is shown.
        name: Submitter name; ``'Unknown'`` is shown when it is empty.
        relaxed_accs: Accuracy mapping for the relaxed mode.
        strict_accs: Accuracy mapping for the strict mode.
        relaxed_table: Detailed results table for the relaxed mode.
        strict_table: Detailed results table for the strict mode.

    Returns:
        The report as a single newline-joined string.
    """
    rule = "-" * 60

    def pct(accs: dict, key: str) -> str:
        # One-decimal percentage; a missing metric is rendered as 0.0%.
        return f"{float(accs.get(key, 0)):.1f}%"

    def mode_section(title: str, accs: dict) -> list[str]:
        # Digest for one evaluation mode, followed by a blank separator line.
        return [
            title,
            f" Β· 전체 정확도: {pct(accs, 'acc')}",
            (
                f" Β· Fast-changing: {pct(accs, 'acc_fast_changing')} | "
                f"Slow-changing: {pct(accs, 'acc_slow_changing')} | "
                f"Never-changing: {pct(accs, 'acc_never_changing')}"
            ),
            f" Β· False premise: {pct(accs, 'acc_fp')}",
            "",
        ]

    def table_section(title: str, table: pd.DataFrame) -> list[str]:
        # Title framed by two rules, then the table without its index column.
        return [rule, title, rule, table.to_string(index=False)]

    shown_file = ""
    if file_name:
        shown_file = os.path.basename(file_name)

    report: list[str] = [
        "βœ… 제좜 및 평가 μ™„λ£Œ",
        "",
        "[κΈ°λ³Έ 정보]",
        f"- 제좜 파일: {shown_file}",
        "- 평가 μ‹œμŠ€ν…œ: Solar Pro API",
        "",
        "[κ²°κ³Ό μš”μ•½]",
    ]
    report += mode_section("- Relaxed λͺ¨λ“œ", relaxed_accs)
    report += mode_section("- Strict λͺ¨λ“œ", strict_accs)
    report += [
        "[제좜 메타]",
        f"- 제좜자: {name or 'Unknown'}",
        f"- 평가 μΌμ‹œ: {get_current_datetime_str()}",
        "- λΉ„κ³ : Relaxed/Strict κ²°κ³Όκ°€ λ¦¬λ”λ³΄λ“œμ— λ°˜μ˜λ˜μ—ˆμŠ΅λ‹ˆλ‹€.",
        "",
    ]
    report += table_section("상세 κ²°κ³Ό ν…Œμ΄λΈ” (Relaxed)", relaxed_table)
    report.append("")
    report += table_section("상세 κ²°κ³Ό ν…Œμ΄λΈ” (Strict)", strict_table)
    return "\n".join(report)
271
+
272
+ # --------- 7) 정확도 ν‘œ ----------
273
+ def _create_detailed_results_table(self, accs: dict, counts: dict) -> pd.DataFrame:
274
+ table_data = []
275
+
276
+ # 전체 정확도
277
+ table_data.append({
278
+ 'μΉ΄ν…Œκ³ λ¦¬': '전체 정확도',
279
+ '전체': f"{accs.get('acc', 0):.1f}% ({counts.get('acc', 0)}개)",
280
+ 'ν…ŒμŠ€νŠΈ': f"{accs.get('acc_test', 0):.1f}% ({counts.get('acc_test', 0)}개)",
281
+ '개발': f"{accs.get('acc_dev', 0):.1f}% ({counts.get('acc_dev', 0)}개)"
282
+ })
283
+
284
+ # 사싀 μœ ν˜•λ³„ 정확도
285
+ fact_types = {
286
+ 'fast_changing': 'λΉ λ₯΄κ²Œ λ³€ν•˜λŠ” 사싀',
287
+ 'slow_changing': '천천히 λ³€ν•˜λŠ” 사싀',
288
+ 'never_changing': 'λ³€ν•˜μ§€ μ•ŠλŠ” 사싀'
289
+ }
290
+
291
+ for key, name in fact_types.items():
292
+ table_data.append({
293
+ 'μΉ΄ν…Œκ³ λ¦¬': name,
294
+ '전체': f"{accs.get(f'acc_{key}', 0):.1f}% ({counts.get(f'acc_{key}', 0)}개)",
295
+ 'ν…ŒμŠ€νŠΈ': f"{accs.get(f'acc_test_{key}', 0):.1f}% ({counts.get(f'acc_test_{key}', 0)}개)",
296
+ '개발': f"{accs.get(f'acc_dev_{key}', 0):.1f}% ({counts.get(f'acc_dev_{key}', 0)}개)"
297
+ })
298
+
299
+ # 질문 μœ ν˜•λ³„ 정확도
300
+ question_types = {
301
+ 'vp': 'μœ νš¨ν•œ μ „μ œ (Valid Premise)',
302
+ 'fp': '잘λͺ»λœ μ „μ œ (False Premise)'
303
+ }
304
+
305
+ for key, name in question_types.items():
306
+ table_data.append({
307
+ 'μΉ΄ν…Œκ³ λ¦¬': name,
308
+ '전체': f"{accs.get(f'acc_{key}', 0):.1f}% ({counts.get(f'acc_{key}', 0)}개)",
309
+ 'ν…ŒμŠ€νŠΈ': f"{accs.get(f'acc_test_{key}', 0):.1f}% ({counts.get(f'acc_test_{key}', 0)}개)",
310
+ '개발': f"{accs.get(f'acc_dev_{key}', 0):.1f}% ({counts.get(f'acc_dev_{key}', 0)}개)"
311
+ })
312
+
313
+ # 홉 μˆ˜λ³„ 정확도
314
+ table_data.append({
315
+ 'μΉ΄ν…Œκ³ λ¦¬': f" β”” {name} (단일 홉)",
316
+ '전체': f"{accs.get(f'acc_{key}_one_hop', 0):.1f}% ({counts.get(f'acc_{key}_one_hop', 0)}개)",
317
+ 'ν…ŒμŠ€νŠΈ': f"{accs.get(f'acc_test_{key}_one_hop', 0):.1f}% ({counts.get(f'acc_test_{key}_one_hop', 0)}개)",
318
+ '개발': f"{accs.get(f'acc_dev_{key}_one_hop', 0):.1f}% ({counts.get(f'acc_dev_{key}_one_hop', 0)}개)"
319
+ })
320
+
321
+ table_data.append({
322
+ 'μΉ΄ν…Œκ³ λ¦¬': f" β”” {name} (닀쀑 홉)",
323
+ '전체': f"{accs.get(f'acc_{key}_two_hop', 0):.1f}% ({counts.get(f'acc_{key}_two_hop', 0)}개)",
324
+ 'ν…ŒμŠ€νŠΈ': f"{accs.get(f'acc_test_{key}_two_hop', 0):.1f}% ({counts.get(f'acc_test_{key}_two_hop', 0)}개)",
325
+ '개발': f"{accs.get(f'acc_dev_{key}_two_hop', 0):.1f}% ({counts.get(f'acc_dev_{key}_two_hop', 0)}개)"
326
+ })
327
+
328
+ # 연도별 정확도
329
+ table_data.append({
330
+ 'μΉ΄ν…Œκ³ λ¦¬': f" β”” {name} (였래된 데이터)",
331
+ '전체': f"{accs.get(f'acc_{key}_old', 0):.1f}% ({counts.get(f'acc_{key}_old', 0)}개)",
332
+ 'ν…ŒμŠ€νŠΈ': f"{accs.get(f'acc_test_{key}_old', 0):.1f}% ({counts.get(f'acc_test_{key}_old', 0)}개)",
333
+ '개발': f"{accs.get(f'acc_dev_{key}_old', 0):.1f}% ({counts.get(f'acc_dev_{key}_old', 0)}개)"
334
+ })
335
+
336
+ table_data.append({
337
+ 'μΉ΄ν…Œκ³ λ¦¬': f" β”” {name} (졜�� 데이터)",
338
+ '전체': f"{accs.get(f'acc_{key}_new', 0):.1f}% ({counts.get(f'acc_{key}_new', 0)}개)",
339
+ 'ν…ŒμŠ€νŠΈ': f"{accs.get(f'acc_test_{key}_new', 0):.1f}% ({counts.get(f'acc_test_{key}_new', 0)}개)",
340
+ '개발': f"{accs.get(f'acc_dev_{key}_new', 0):.1f}% ({counts.get(f'acc_dev_{key}_new', 0)}개)"
341
+ })
342
+
343
+ return pd.DataFrame(table_data)
344
+
345
+ # --------- 8) λ¦¬λ”λ³΄λ“œ ν–‰ 생성 ----------
346
def _build_leaderboard_rows(
    self,
    name: str,
    submit_model: str,
    submit_description: Optional[str],
    mode: str,
    accs: dict,
    counts: Optional[dict] = None,
):
    """
    Build one leaderboard row (a dict) for a single evaluation mode.

    Only test-set metrics (``acc_test*``) are published. Missing metrics
    default to 0.

    Args:
        name: Submitter name; stored under ``'id'`` ('Unknown' when blank).
        submit_model: Model name as entered by the submitter.
        submit_description: Optional free-text description.
        mode: Evaluation mode label stored under ``'evaluation_mode'``.
        accs: Mapping of metric key to accuracy.
        counts: Optional mapping of metric key to question count. When given,
            ``'total_questions'`` is taken from ``counts['acc_test']``.

    Returns:
        A dict with the leaderboard columns in their display order.
    """
    def test_acc(suffix: str = "") -> float:
        return float(accs.get(f"acc_test{suffix}", 0))

    submitter_id = f"{name}".strip()

    row = {
        'id': submitter_id if submitter_id else "Unknown",
        'model': submit_model,
        'description': submit_description,
        'accuracy': test_acc(),
        'fast_changing_accuracy': test_acc('_fast_changing'),
        'slow_changing_accuracy': test_acc('_slow_changing'),
        'never_changing_accuracy': test_acc('_never_changing'),
    }

    # Premise / hop / data-age breakdown, then per-domain accuracy.
    # The order here is the column order of the leaderboard.
    breakdown = (
        'vp', 'fp',
        'vp_one_hop', 'vp_two_hop', 'fp_one_hop', 'fp_two_hop',
        'vp_old', 'vp_new', 'fp_old', 'fp_new',
        'politics', 'sports', 'entertainment', 'weather', 'world',
        'economy', 'society', 'it_science', 'life_culture', 'unknown',
    )
    for metric in breakdown:
        row[f'acc_{metric}'] = test_acc(f'_{metric}')

    if counts is not None:
        row['total_questions'] = int(counts.get('acc_test', 0))
    else:
        # NOTE(review): without `counts` this stores the truncated test
        # accuracy, not a question count (legacy behaviour, kept so existing
        # callers see no change). Callers should pass `counts`.
        row['total_questions'] = int(accs.get('acc_test', 0))

    row['evaluation_date'] = get_current_datetime_str()
    row['evaluation_mode'] = mode
    return row
389
+
390
def _save_leaderboard(
    self,
    name: str,
    submit_model: str,
    submit_description: Optional[str],
    relaxed_accs: dict,
    strict_accs: dict
):
    """
    Append the Relaxed and Strict rows for this submission to the leaderboard.

    A failure while appending is reported on stdout and otherwise ignored,
    so it never aborts the submission flow.
    """
    per_mode = (('Relaxed', relaxed_accs), ('Strict', strict_accs))
    new_rows = [
        self._build_leaderboard_rows(name, submit_model, submit_description, mode, mode_accs)
        for mode, mode_accs in per_mode
    ]
    try:
        append_to_leaderboard_data(new_rows)
    except Exception as save_error:
        print(f"⚠️ λ¦¬λ”λ³΄λ“œ μ €μž₯ μ‹€νŒ¨: {save_error}")
406
+
407
+
408
+ # --------- 9) 곡개 μ—”λ“œν¬μΈνŠΈ(핡심) ----------
409
def process_submission(
    self,
    file,
    name: str,
    submit_model: str,
    submit_description: str,
    progress: gr.Progress = gr.Progress()
) -> str:
    """
    Validate, evaluate and publish one submission file.

    Internal helpers return ``Result`` objects; this method always returns a
    plain string for Gradio (either the result report or an error message).

    Steps: submission-limit check, file validation, CSV load, merge with the
    reference data, evaluation, accuracy aggregation for both modes, report
    building, then submission recording and leaderboard update.

    Args:
        file: Uploaded file object; its ``.name`` attribute is used as path.
        name: Submitter name.
        submit_model: Model name; blank values become "Anonymous Model".
        submit_description: Optional description; blank values become None.
        progress: Gradio progress callback.

    Returns:
        The result report on success, otherwise a user-facing error string.
    """
    normalized_model = (submit_model or "").strip() or "Anonymous Model"
    normalized_description = (submit_description or "").strip() or None

    # 1) Submission-limit check
    tracker: Optional[SubmissionTracker] = None
    if self.enable_limit:
        tracker = self.tracker or get_submission_tracker()
        if tracker is not None:
            self.tracker = tracker
    limit_active = bool(self.enable_limit and tracker)

    def record_failure(error_message: str) -> None:
        # Best-effort bookkeeping: a tracker problem must not mask the
        # evaluation error that is about to be returned to the user.
        if not limit_active:
            return
        try:
            tracker.record_submission(
                name,
                os.path.basename(file.name),
                success=False,
                error_message=error_message,
                submit_model=normalized_model,
                submit_description=normalized_description,
            )
        except Exception as record_error:
            print(f"[warn] could not record failed submission: {record_error}")

    if limit_active:
        try:
            can_submit, message, _ = tracker.can_submit()
            if not can_submit:
                return f"❌ 제좜 μ œν•œ: {message}"
        except Exception as e:
            return f"❌ 제좜 μ œν•œ 확인 μ‹€νŒ¨: {e}"

    # 2) File validation
    progress(0.05, desc="제좜 파일 검증 쀑...")
    v = self._validate_submission_file(file)
    if not v.ok:
        return v.error or "❌ 제좜 파일 검증 μ‹€νŒ¨"

    # 3) Load
    progress(0.1, desc="κΈ°μ€€ 데이터 λ‘œλ“œ 쀑...")
    loaded = self._load_submission_df(file)
    if not loaded.ok:
        return loaded.error or "❌ CSV λ‘œλ”© μ‹€νŒ¨"
    submission_df: pd.DataFrame = loaded.data

    # 4) Merge with the reference data
    progress(0.15, desc="κΈ°μ€€ 데이터와 병합 쀑...")
    mg = self._merge_with_base(submission_df, file.name)
    if not mg.ok:
        return mg.error or "❌ κΈ°μ€€ 데이터 병합 μ‹€νŒ¨"
    merged_df: pd.DataFrame = mg.data

    # 5) Evaluation. Inner progress is mapped onto 0.15 ~ 0.8 so the bar
    #    never moves backwards when the 0.8 "analysis" step starts.
    progress(0.15, desc="FreshQA 평가 μ€€λΉ„ 쀑...")

    def on_inner_progress(done: int, total: int, desc: str):
        frac = 0.15 + 0.65 * (done / max(total, 1))
        progress(frac, desc=desc)

    ev = self._evaluate_freshqa(merged_df, on_progress=on_inner_progress)
    if not ev.ok:
        record_failure(ev.error or "평가 μ‹€νŒ¨")
        return ev.error or "❌ 평가 쀑 였λ₯˜κ°€ λ°œμƒν–ˆμŠ΅λ‹ˆλ‹€"

    relaxed_df, strict_df = ev.data  # type: ignore[assignment]

    # 6) Aggregate results for both modes
    progress(0.8, desc="평가 κ²°κ³Ό 뢄석 쀑...")
    r = self._calculate_accuracy(relaxed_df)
    if not r.ok:
        record_failure(r.error or "집계 μ‹€νŒ¨")
        return r.error or "❌ κ²°κ³Ό 집계 μ‹€νŒ¨"

    s = self._calculate_accuracy(strict_df)
    if not s.ok:
        record_failure(s.error or "집계 μ‹€νŒ¨")
        return s.error or "❌ κ²°κ³Ό 집계 μ‹€νŒ¨"

    relaxed_processed, relaxed_accs, relaxed_counts = r.data  # type: ignore[misc]
    strict_processed, strict_accs, strict_counts = s.data  # type: ignore[misc]

    # 7) Tables and report
    relaxed_table = self._create_detailed_results_table(relaxed_accs, relaxed_counts)
    strict_table = self._create_detailed_results_table(strict_accs, strict_counts)

    result_summary = self._get_result_summary(
        file_name=file.name if file else "",
        name=name,
        relaxed_accs=relaxed_accs,
        strict_accs=strict_accs,
        relaxed_table=relaxed_table,
        strict_table=strict_table,
    )

    # 8) Record the successful submission, then update the leaderboard.
    # NOTE(review): the return value of record_submission is not checked, so
    # the leaderboard is updated even if the record could not be stored.
    if limit_active:
        progress(0.85, desc="제좜 λ‚΄μ—­ μ €μž₯ 쀑...")
        tracker.record_submission(
            name,
            os.path.basename(file.name),
            success=True,
            submit_model=normalized_model,
            submit_description=normalized_description,
        )
        progress(0.9, desc="λ¦¬λ”λ³΄λ“œ μ—…λ°μ΄νŠΈ 쀑...")
    self._save_leaderboard(name, normalized_model, normalized_description, relaxed_accs, strict_accs)

    # 9) Done
    progress(1.0, desc="μ™„λ£Œ")
    return result_summary
556
+
557
+
558
+ # -------------------------
559
+ # λͺ¨λ“ˆ-레벨 μ—”νŠΈλ¦¬ν¬μΈνŠΈ (κΈ°μ‘΄ UI ν˜Έν™˜)
560
+ # -------------------------
561
def process_submission(
    file,
    name: str,
    submit_model: str,
    submit_description: str,
    progress: gr.Progress = gr.Progress()
) -> str:
    """
    Entry point called directly by Gradio.

    Creates a ``SubmissionHandler`` with its dependencies injected and
    delegates to it. Any unexpected exception from the handler is converted
    into a user-facing error string; when submission limiting is active the
    failure is also recorded through the tracker (best effort).

    Args:
        file: Uploaded file object (may be None).
        name: Submitter name.
        submit_model: Model name entered by the submitter.
        submit_description: Optional description entered by the submitter.
        progress: Gradio progress callback.

    Returns:
        The handler's result string, or a formatted failure message.
    """
    import time  # local import keeps this block self-contained

    started_at = time.time()
    tracker = get_submission_tracker() if Config.ENABLE_SUBMISSION_LIMIT else None
    handler = SubmissionHandler(tracker=tracker, cfg=Config)
    try:
        return handler.process_submission(
            file=file,
            name=name,
            submit_model=submit_model,
            submit_description=submit_description,
            progress=progress,
        )
    except Exception as e:
        # Top-level safety net: report unexpected errors in a friendly way.
        try:
            if handler.enable_limit and handler.tracker:
                # The submitter name is passed positionally, as the handler
                # does: the tracker's parameter is `submitter_name`, so the
                # former `name=` keyword raised a TypeError that was silently
                # swallowed and the failure was never recorded.
                handler.tracker.record_submission(
                    name,
                    os.path.basename(file.name) if file else "(unknown)",
                    success=False,
                    error_message=str(e),
                    submit_model=(submit_model or "").strip() or "Anonymous Model",
                    submit_description=(submit_description or "").strip() or None,
                )
        except Exception as record_error:
            # Recording stays best-effort, but is no longer invisible.
            print(f"[warn] could not record failed submission: {record_error}")

        total_time = time.time() - started_at
        error_message = str(e)

        return (
            "❌ 평가 μ‹€νŒ¨\n\n"
            "였λ₯˜ λ‚΄μš©:\n"
            f"{error_message}\n\n"
            f"μ†Œμš” μ‹œκ°„: {total_time:.2f}초 ({total_time/60:.2f}λΆ„)\n\n"
            "μ œμΆœμ€ μ •μƒμ μœΌλ‘œ μ²˜λ¦¬λ˜μ—ˆμ§€λ§Œ, 평가 κ³Όμ •μ—μ„œ 였λ₯˜κ°€ λ°œμƒν–ˆμŠ΅λ‹ˆλ‹€.\n"
            "제좜 기둝은 μ €μž₯λ˜μ—ˆμŠ΅λ‹ˆλ‹€."
        )
src/submission_tracker.py ADDED
@@ -0,0 +1,304 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ μ‚¬μš©μž 제좜 좔적 λͺ¨λ“ˆ
3
+ HuggingFace μ‚¬μš©μž IDλ₯Ό 기반으둜 ν•˜λ£¨ 3번 μ œν•œ κΈ°λŠ₯을 μ œκ³΅ν•©λ‹ˆλ‹€.
4
+ 제좜 μ •λ³΄λŠ” λ³„λ„μ˜ HuggingFace repositoryμ—μ„œ κ΄€λ¦¬λ©λ‹ˆλ‹€.
5
+ """
6
+
7
+ import os
8
+ import json
9
+ import pandas as pd
10
+ import tempfile
11
+ from datetime import datetime, date
12
+ from typing import Dict, List, Optional, Tuple
13
+ from huggingface_hub import whoami, hf_hub_download, login, HfApi
14
+ import pytz
15
+ from src.utils import file_lock, get_current_date_str, get_current_datetime_str
16
+
17
+ # ν•œκ΅­ μ‹œκ°„λŒ€ μ„€μ •
18
+ KOREA_TZ = pytz.timezone('Asia/Seoul')
19
+
20
+
21
class SubmissionTracker:
    """Track per-user submissions, stored as JSON in a HuggingFace dataset repo.

    Records are kept as ``{user_id: {"YYYY-MM-DD": [record, ...]}}``. Only
    successful submissions count towards the daily limit.
    """

    # Maximum number of successful submissions per user per day.
    DAILY_SUBMISSION_LIMIT = 3

    def __init__(self,
                 repo_id: Optional[str] = None,
                 token: Optional[str] = None,
                 filename: str = "user_submissions.json"):
        """
        Args:
            repo_id: HuggingFace repository ID (e.g. "username/submission-tracker").
                Falls back to the SUBMISSION_TRACKER_REPO_ID environment variable.
            token: HuggingFace API token. Falls back to HF_TOKEN, then
                HUGGINGFACE_HUB_TOKEN.
            filename: Name of the records file inside the repository.

        Raises:
            ValueError: If no repository ID or no token is available.
            Exception: Re-raised from ``login`` when authentication fails.
        """
        self.repo_id = repo_id or os.getenv('SUBMISSION_TRACKER_REPO_ID')
        self.token = token or os.getenv('HF_TOKEN') or os.getenv('HUGGINGFACE_HUB_TOKEN')
        self.filename = filename

        if not self.repo_id:
            raise ValueError(
                "SUBMISSION_TRACKER_REPO_ID ν™˜κ²½λ³€μˆ˜κ°€ μ„€μ •λ˜μ§€ μ•Šμ•˜μŠ΅λ‹ˆλ‹€. "
                "λ˜λŠ” repo_idλ₯Ό 직접 μ „λ‹¬ν•΄μ£Όμ„Έμš”."
            )

        if not self.token:
            raise ValueError(
                "HuggingFace 토큰이 ν•„μš”ν•©λ‹ˆλ‹€. "
                "토큰을 직접 μ „λ‹¬ν•˜κ±°λ‚˜ HF_TOKEN ν™˜κ²½λ³€μˆ˜λ₯Ό μ„€μ •ν•˜μ„Έμš”."
            )

        # Initialise the HuggingFace API client and log in.
        self.api = HfApi()
        try:
            login(token=self.token)
        except Exception as e:
            print(f"❌ HuggingFace 둜그인 μ‹€νŒ¨: {e}")
            raise

        # In-memory copy of the records.
        self.submissions = self.load_submissions()

    # ------------------------------------------------------------------
    # Loading
    # ------------------------------------------------------------------
    def _download_submissions(self) -> Dict:
        """Download and parse the records file; raises on any failure."""
        with tempfile.TemporaryDirectory() as temp_dir:
            file_path = hf_hub_download(
                repo_id=self.repo_id,
                filename=self.filename,
                local_dir=temp_dir,
                repo_type="dataset",
                token=self.token
            )
            with open(file_path, 'r', encoding='utf-8') as f:
                return json.load(f)

    def _load_submissions_for_update(self) -> Dict:
        """Load the latest records before a read-modify-write cycle.

        Only a file that is genuinely absent from the repository is treated
        as "no records yet". Every other failure (network outage, HTTP error,
        corrupt JSON) propagates, so the caller never uploads a near-empty
        file over existing history.
        """
        from huggingface_hub.utils import EntryNotFoundError, LocalEntryNotFoundError

        try:
            return self._download_submissions()
        except LocalEntryNotFoundError:
            # Subclass of EntryNotFoundError raised when the Hub could not be
            # reached; the remote file may well exist, so do not start fresh.
            raise
        except EntryNotFoundError:
            return {}

    def load_submissions(self) -> Dict:
        """Load the records from the repository.

        Returns:
            The parsed records, or an empty dict when loading fails for any
            reason (the failure is printed).
        """
        try:
            return self._download_submissions()
        except Exception as e:
            print(f"⚠️ 제좜 기둝 λ‘œλ“œ μ‹€νŒ¨ (μƒˆλ‘œ μ‹œμž‘): {e}")
            return {}

    # ------------------------------------------------------------------
    # Queries
    # ------------------------------------------------------------------
    def get_user_id(self) -> Optional[str]:
        """Return the ``id`` field reported by ``whoami()`` for the current login.

        Raises:
            Exception: If the user information cannot be retrieved.
        """
        try:
            user_info = whoami()
            return user_info.get("id", None)
        except Exception as e:
            print(f"⚠️ μ‚¬μš©μž ID κ°€μ Έμ˜€κΈ° μ‹€νŒ¨: {e}")
            raise Exception("❌ μ‚¬μš©μž IDλ₯Ό κ°€μ Έμ˜¬ 수 μ—†μŠ΅λ‹ˆλ‹€. HuggingFace에 λ‘œκ·ΈμΈλ˜μ–΄ μžˆλŠ”μ§€ ν™•μΈν•΄μ£Όμ„Έμš”.") from e

    def get_today_submissions(self, user_id: str) -> List[Dict]:
        """Return today's records for ``user_id`` (empty list when none)."""
        today = get_current_date_str()
        user_submissions = self.submissions.get(user_id, {})
        return user_submissions.get(today, [])

    def can_submit(self, submissions_data: Optional[Dict] = None) -> Tuple[bool, str, int]:
        """Check whether the current user may submit today.

        Args:
            submissions_data: Records to check against; defaults to the
                in-memory copy.

        Returns:
            ``(True, message, remaining)`` when the user is under the limit.

        Raises:
            Exception: When the daily limit is already reached (callers rely
                on this instead of a ``False`` return), or when the user ID
                cannot be retrieved.
        """
        limit = self.DAILY_SUBMISSION_LIMIT
        user_id = self.get_user_id()
        data = submissions_data if submissions_data is not None else self.submissions
        today = get_current_date_str()
        today_submissions = data.get(user_id, {}).get(today, [])
        successful_count = sum(1 for s in today_submissions if s.get('success', False))

        if successful_count >= limit:
            raise Exception("❌ 였늘 제좜 ν•œλ„λ₯Ό μ΄ˆκ³Όν–ˆμŠ΅λ‹ˆλ‹€. 내일 λ‹€μ‹œ μ‹œλ„ν•΄μ£Όμ„Έμš”.")

        remaining = limit - successful_count
        return True, f"βœ… 제좜 κ°€λŠ₯ν•©λ‹ˆλ‹€. (였늘 {successful_count}/{limit}회 μ‚¬μš©, {remaining}회 λ‚¨μŒ)", remaining

    # ------------------------------------------------------------------
    # Updates
    # ------------------------------------------------------------------
    def _lock_file_path(self) -> str:
        """Path of the lock file guarding updates for this repository."""
        lock_name = f'{self.repo_id.replace("/", "_")}.lock'
        return os.path.join(tempfile.gettempdir(), lock_name)

    def record_submission(
        self,
        submitter_name: str,
        file_name: str,
        success: bool,
        error_message: Optional[str] = None,
        submit_model: Optional[str] = None,
        submit_description: Optional[str] = None
    ) -> bool:
        """Append one submission record, guarded by the file lock.

        The latest records are re-loaded inside the lock and the daily limit
        is re-checked against them before the new record is stored.

        Returns:
            True when the record was uploaded. False when the limit check
            failed, the latest records could not be loaded, or the upload
            failed.

        Raises:
            Exception: If the user ID cannot be retrieved (before locking).
        """
        user_id = self.get_user_id()

        with file_lock(self._lock_file_path()):
            try:
                # Another process may have updated the file in the meantime.
                latest_submissions = self._load_submissions_for_update()

                # Re-check the limit against the fresh data.
                try:
                    self.can_submit(submissions_data=latest_submissions)
                except Exception:
                    # Limit exceeded (or user lookup failed): refresh the
                    # in-memory copy only, store nothing.
                    self.submissions = latest_submissions
                    return False

                today = get_current_date_str()
                day_records = latest_submissions.setdefault(user_id, {}).setdefault(today, [])
                day_records.append({
                    "timestamp": get_current_datetime_str(),
                    "submitter_name": submitter_name,
                    "file_name": file_name,
                    "success": success,
                    "error_message": error_message,
                    "submit_model": submit_model,
                    "submit_description": submit_description
                })

                self.submissions = latest_submissions
                return self._save_submissions_internal(latest_submissions)

            except Exception as e:
                print(f"❌ 제좜 기둝 μΆ”κ°€ μ‹€νŒ¨: {e}")
                return False

    def _save_submissions_internal(self, submissions_data: Dict) -> bool:
        """Upload ``submissions_data`` to the repository (lock already held).

        Returns:
            True on success, False when writing or uploading fails.
        """
        temp_file_path = None
        try:
            with tempfile.NamedTemporaryFile(mode='w', encoding='utf-8', suffix='.json', delete=False) as temp_file:
                temp_file_path = temp_file.name
                json.dump(submissions_data, temp_file, ensure_ascii=False, indent=2)

            self.api.upload_file(
                path_or_fileobj=temp_file_path,
                path_in_repo=self.filename,
                repo_id=self.repo_id,
                repo_type="dataset",
                token=self.token,
                commit_message=f"Update submission records - {datetime.now(KOREA_TZ).strftime('%Y-%m-%d %H:%M:%S')}"
            )
            return True

        except Exception as e:
            print(f"❌ 제좜 기둝 μ €μž₯ μ‹€νŒ¨: {e}")
            return False

        finally:
            # Remove the temp file on failure as well as on success.
            if temp_file_path is not None:
                try:
                    os.unlink(temp_file_path)
                except OSError:
                    pass

    # ------------------------------------------------------------------
    # History / statistics
    # ------------------------------------------------------------------
    def get_user_submission_history(self, user_id: str, days: int = 7) -> Dict:
        """Return the user's records for the last ``days`` days, keyed by date."""
        if not user_id or user_id not in self.submissions:
            return {}

        user_submissions = self.submissions[user_id]
        today = datetime.now(KOREA_TZ).date()

        history = {}
        for i in range(days):
            date_str = (today - pd.Timedelta(days=i)).strftime('%Y-%m-%d')
            if date_str in user_submissions:
                history[date_str] = user_submissions[date_str]

        return history

    def get_submission_stats(self, user_id: str) -> Dict:
        """Return today's and the last 7 days' submission statistics."""
        if not user_id:
            return {}

        today_submissions = self.get_today_submissions(user_id)
        successful_today_count = sum(1 for s in today_submissions if s.get('success', False))
        history = self.get_user_submission_history(user_id, 7)

        total_submissions = sum(len(day_submissions) for day_submissions in history.values())
        successful_submissions = sum(
            1
            for day_submissions in history.values()
            for s in day_submissions
            if s.get('success', False)
        )
        failed_submissions = total_submissions - successful_submissions

        return {
            "today_count": len(today_submissions),
            "today_remaining": max(0, self.DAILY_SUBMISSION_LIMIT - successful_today_count),
            "week_total": total_submissions,
            "week_successful": successful_submissions,
            "week_failed": failed_submissions,
            "history": history
        }

    def cleanup_old_records(self, days_to_keep: int = 30):
        """Delete per-day records older than ``days_to_keep`` days.

        Returns:
            The number of per-day entries removed, or 0 on failure. The file
            is uploaded only when at least one entry was removed.
        """
        with file_lock(self._lock_file_path()):
            try:
                latest_submissions = self._load_submissions_for_update()

                cutoff_date = datetime.now(KOREA_TZ) - pd.Timedelta(days=days_to_keep)
                # ISO dates compare correctly as strings.
                cutoff_str = cutoff_date.strftime('%Y-%m-%d')

                cleaned_count = 0
                for user_id in list(latest_submissions.keys()):
                    user_submissions = latest_submissions[user_id]
                    for date_str in list(user_submissions.keys()):
                        if date_str < cutoff_str:
                            del user_submissions[date_str]
                            cleaned_count += 1

                    # Drop users that have no records left.
                    if not user_submissions:
                        del latest_submissions[user_id]

                self.submissions = latest_submissions

                if cleaned_count > 0:
                    if self._save_submissions_internal(latest_submissions):
                        print(f"🧹 {cleaned_count}개의 였래된 제좜 기둝을 μ •λ¦¬ν–ˆμŠ΅λ‹ˆλ‹€.")
                    else:
                        print(f"⚠️ {cleaned_count}개의 였래된 제좜 기둝을 μ •λ¦¬ν–ˆμ§€λ§Œ μ €μž₯에 μ‹€νŒ¨ν–ˆμŠ΅λ‹ˆλ‹€.")

                return cleaned_count

            except Exception as e:
                print(f"❌ 였래된 기둝 정리 μ‹€νŒ¨: {e}")
                return 0
296
+
297
+
298
def get_submission_tracker() -> Optional[SubmissionTracker]:
    """Create a SubmissionTracker with default settings.

    Returns:
        The new tracker, or None when construction fails (the error is
        printed).
    """
    tracker = None
    try:
        tracker = SubmissionTracker()
    except Exception as init_error:
        print(f"❌ SubmissionTracker μ΄ˆκΈ°ν™” μ‹€νŒ¨: {init_error}")
    return tracker
src/utils.py ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ μœ ν‹Έλ¦¬ν‹° ν•¨μˆ˜ λͺ¨λ“ˆ
3
+ κ³΅ν†΅μœΌλ‘œ μ‚¬μš©λ˜λŠ” μœ ν‹Έλ¦¬ν‹° ν•¨μˆ˜λ“€μ„ λͺ¨μ•„놓은 λͺ¨λ“ˆμž…λ‹ˆλ‹€.
4
+ """
5
+
6
+ import os
7
+ import fcntl
8
+ import pytz
9
+ from contextlib import contextmanager
10
+ from datetime import datetime
11
+
12
+ # ν•œκ΅­ μ‹œκ°„λŒ€ μ„€μ •
13
+ KOREA_TZ = pytz.timezone('Asia/Seoul')
14
+
15
def get_korea_datetime_now():
    """Return the current time as a timezone-aware datetime in KOREA_TZ."""
    return datetime.now(tz=KOREA_TZ)
18
+
19
def get_current_datetime_str(dt=None):
    """Format a datetime as 'YYYY-MM-DD HH:MM:SS'.

    Args:
        dt: The datetime to format. When None, the current Korea time is used.
    """
    moment = get_korea_datetime_now() if dt is None else dt
    return moment.strftime('%Y-%m-%d %H:%M:%S')
24
+
25
def get_current_date_str():
    """Return today's date in Korea time as 'YYYY-MM-DD'."""
    now_in_korea = get_korea_datetime_now()
    return now_in_korea.strftime("%Y-%m-%d")
28
+
29
@contextmanager
def file_lock(lock_file_path):
    """
    Context manager providing an exclusive, file-based lock.

    The lock file is created if it does not exist and is never truncated.
    Acquisition blocks until the lock is free. The lock is released when the
    block exits, including when it exits with an exception.

    Args:
        lock_file_path: Path of the lock file.

    Yields:
        None (used purely as a context manager).

    Examples:
        >>> with file_lock('/tmp/test.lock'):
        ...     # work performed while holding the lock
        ...     pass
    """
    # Append mode creates the file when missing without truncating it, so
    # there is no check-then-create race between competing processes.
    with open(lock_file_path, 'a') as lock_file:
        fd = lock_file.fileno()
        # Acquire outside the try: if this fails there is nothing to unlock.
        fcntl.flock(fd, fcntl.LOCK_EX)
        try:
            yield
        finally:
            fcntl.flock(fd, fcntl.LOCK_UN)
ui/dataset_tab.py ADDED
@@ -0,0 +1,142 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ 데이터셋 λ‹€μš΄λ‘œλ“œ νƒ­ UI μ»΄ν¬λ„ŒνŠΈ
3
+
4
+ πŸ’Ύ 데이터셋 λ‹€μš΄λ‘œλ“œ νƒ­μ˜ UI와 λ‘œμ§μ„ κ΄€λ¦¬ν•©λ‹ˆλ‹€.
5
+ """
6
+
7
+ import gradio as gr
8
+ import pandas as pd
9
+
10
+
11
def create_dataset_tab():
    """Build the dataset download tab.

    Renders the dataset description, a download button and a five-row
    preview for the DEV and TEST CSV files, download guidance, and the
    license / citation section. Must be called inside a Gradio Blocks
    context.
    """
    dev_csv = "data/public/ko-freshqa_2025_dev.csv"
    test_csv = "data/public/ko-freshqa_2025_test.csv"

    def load_preview(csv_path, rows=5):
        # Used as a callable component value. A missing or unreadable CSV
        # yields an empty preview instead of an error.
        try:
            return pd.read_csv(csv_path).head(rows)
        except Exception as e:
            print(f"⚠️ 데이터셋 미리보기 λ‘œλ“œ μ‹€νŒ¨: {e}")
            return pd.DataFrame()

    # The "fast changing" and "never changing" descriptions below were
    # swapped in the previous revision; they now match their labels.
    gr.Markdown("""
    ### Ko-FreshQA Dataset

    - 이 데이터셋 및 λ¦¬λ”λ³΄λ“œλŠ” [FreshQA](https://github.com/freshllms/freshqa)μ—μ„œ μ˜κ°μ„ λ°›μ•„ λ§Œλ“€μ–΄μ‘ŒμŠ΅λ‹ˆλ‹€.
    - fact type(fast changing, slow changing, never changing), μ „μ œμ˜ μœ νš¨μ„±, 10개의 도메인에 따라 λ‚˜λ‰˜λŠ” μ§ˆλ¬Έλ“€μ„ 톡해 ν•œκ΅­μ–΄ 지식과 κ΄€λ ¨λœ LLM의 μ΅œμ‹ μ„±μ„ νŒλ‹¨ν•  수 μžˆμŠ΅λ‹ˆλ‹€.
    - 검증 및 평가에 ν•„μš”ν•œ 데이터셋은 주기적으둜 μ—…λ°μ΄νŠΈν•  μ˜ˆμ •μž…λ‹ˆλ‹€.

    <br>

    ### Ko-FreshQA 데이터셋은 μ•„λž˜μ™€ 같은 νŠΉμ§•μ„ κ°€μ§€κ³  μžˆμŠ΅λ‹ˆλ‹€.
    - **fact type**
    - μ‹œκ°„μ˜ 흐름에 λ”°λ₯Έ λ‹΅λ³€μ˜ 변동 κ°€λŠ₯성에 따라 μ§ˆλ¬Έμ€ μ•„λž˜μ˜ μ„Έ κ°€μ§€λ‘œ λΆ„λ₯˜λ©λ‹ˆλ‹€.
    - **fast changing** : 닡변이 보톡 1λ…„ λ˜λŠ” κ·Έ 이내에 λ³€ν•˜λŠ” 질문
    - **slow changing** : 닡변이 λͺ‡ 년에 걸쳐 λ³€ν•˜λŠ” 질문
    - **never changing** : 역사적 사건, μ§„μ‹€κ³Ό 같이 닡변이 거의 λ³€ν•˜μ§€ μ•ŠλŠ” 질문

    - **μ „μ œ μœ νš¨μ„±**
    - **false premise (T/F)** : μ§ˆλ¬Έμ— ν¬ν•¨λœ μ „μ œ μžμ²΄κ°€ 잘λͺ»λ˜μ–΄ 있으면 True, μ „μ œμ— λ¬Έμ œκ°€ μ—†μœΌλ©΄ False

    - **one/multi hop**
    - 닡변을 μƒμ„±ν•˜κΈ° μœ„ν•΄ ν•„μš”ν•œ μΆ”λ‘ μ˜ κ°œμˆ˜μ— 따라 μ§ˆλ¬Έμ„ one hop, multi hop으둜 λΆ„λ₯˜ν•©λ‹ˆλ‹€.

    - **도메인**
    - λͺ¨λ“  질문과 λŒ€λ‹΅μ€ λ‹€μŒ 도메인 쀑 ν•˜λ‚˜λ‘œ λΆ„λ₯˜λ©λ‹ˆλ‹€.
    - μ •μΉ˜, 슀포츠, μ—°μ˜ˆ, 날씨, 세계, 경제, μ‚¬νšŒ, IT/κ³Όν•™, μƒν™œ/λ¬Έν™”, UNK

    - **λ‚˜λ¨Έμ§€ 메타 정보**
    - **effective year** : 질문의 닡변이 λ§ˆμ§€λ§‰μœΌλ‘œ λ³€κ²½λœ 연도
    - **next review** : μ˜ˆμƒλ˜λŠ” λ‹€μŒ κ²€ν†  λ‚ μ§œ
    - **source** : 질문/닡변에 λŒ€ν•œ 정보λ₯Ό 찾을 수 μžˆλŠ” 좜처

    <br>
    """)

    with gr.Column(elem_classes=["leaderboard-group"]):
        with gr.Row():
            with gr.Column():
                gr.Markdown("### πŸ§ͺ DEV 데이터셋 (개발/κ²€μ¦μš©)")
                gr.Markdown("""
                **Dev set**: 550쌍
                - λͺ¨λΈ 개발 및 검증을 μœ„ν•΄ μ‚¬μš©ν•  수 μžˆμŠ΅λ‹ˆλ‹€.
                - 정닡을 λΉ„λ‘―ν•˜μ—¬ λͺ¨λ“  메타데이터가 μ œκ³΅λ©λ‹ˆλ‹€.
                """)

                # DEV dataset download button
                dev_download_btn = gr.DownloadButton(
                    "πŸ’Ύ DEV 데이터셋 λ‹€μš΄λ‘œλ“œ",
                    value=dev_csv,
                    variant="primary",
                    size="lg"
                )

                # DEV dataset preview (callable value)
                dev_preview = gr.DataFrame(
                    value=lambda: load_preview(dev_csv),
                    interactive=False,
                    label=""
                )

            with gr.Column():
                gr.Markdown("### 🎯 TEST 데이터셋 (μ΅œμ’… ν‰κ°€μš©)")
                gr.Markdown("""
                **Test set**: 3,000개
                - λ¦¬λ”λ³΄λ“œ μ œμΆœμ„ μœ„ν•œ ν‰κ°€μš© λ°μ΄ν„°μ…‹μž…λ‹ˆλ‹€.
                - model_responseλ₯Ό μ±„μ›Œμ„œ μ œμΆœν•΄μ£Όμ„Έμš”.
                """)

                # TEST dataset download button
                test_download_btn = gr.DownloadButton(
                    "πŸ’Ύ TEST 데이터셋 λ‹€μš΄λ‘œλ“œ",
                    value=test_csv,
                    variant="primary",
                    size="lg"
                )

                # TEST dataset preview (callable value)
                test_preview = gr.DataFrame(
                    value=lambda: load_preview(test_csv),
                    interactive=False,
                    label=""
                )

    # Download guidance
    gr.Markdown("""
    <br>

    ### πŸ’‘ λ‹€μš΄λ‘œλ“œ μ•ˆλ‚΄

    - μœ„μ˜ λ‹€μš΄λ‘œλ“œ λ²„νŠΌμ„ ν΄λ¦­ν•˜λ©΄ λΈŒλΌμš°μ €μ—μ„œ μžλ™μœΌλ‘œ 파일 λ‹€μš΄λ‘œλ“œκ°€ μ‹œμž‘λ©λ‹ˆλ‹€.
    - **DEV 데이터셋**은 λͺ¨λΈ 개발 및 κ²€μ¦οΏ½οΏ½οΏ½μœΌλ‘œ μ‚¬μš©ν•˜μ„Έμš”.
    - **TEST 데이터셋**은 μ΅œμ’… 평가 및 λ¦¬λ”λ³΄λ“œ 제좜용으둜 μ‚¬μš©ν•˜μ„Έμš”.
    - λ‹€μš΄λ‘œλ“œλœ νŒŒμΌμ€ **CSV ν˜•μ‹**, **UTF-8 인코딩**으둜 μ €μž₯λ©λ‹ˆλ‹€.

    <br>
    """)

    # License & References
    gr.Markdown("""
    ### πŸ“š License & References

    - λ³Έ 데이터셋은 **CC-BY-ND-NC (μ €μž‘μžν‘œμ‹œ Β· λ³€κ²½ κΈˆμ§€ Β· λΉ„μ˜λ¦¬)** λΌμ΄μ„ μŠ€λ‘œ μ œκ³΅λ©λ‹ˆλ‹€.
    - 이 λ¦¬λ”λ³΄λ“œλŠ” IITP의 **β€œμƒμ„±ν˜• μ–Έμ–΄λͺ¨λΈμ˜ 지속가λŠ₯μ„±κ³Ό μ‹œκ°„μ˜ 흐름에 λ”°λ₯Έ μ΅œμ‹ μ„± λ°˜μ˜μ„ μœ„ν•œ ν•™μŠ΅ 및 ν™œμš© 기술 κ°œλ°œβ€** μ‚¬μ—…μ˜ 지원을 λ°›μ•„ μ œμž‘λ˜μ—ˆμŠ΅λ‹ˆλ‹€.
    - 이 μ‹œμŠ€ν…œμ€ FreshLLMs ν”„λ‘œμ νŠΈμ˜ **FreshQA 데이터셋과 평가 방법둠**을 기반으둜 κ΅¬μΆ•λ˜μ—ˆμŠ΅λ‹ˆλ‹€.
    - 원본 FreshQAλŠ” 링크λ₯Ό μ°Έκ³ ν•΄ μ£Όμ„Έμš”. πŸ‘‰ https://github.com/freshllms/freshqa
    """)

    gr.Markdown("""
    ```
    @misc{vu2023freshllms,
    title={FreshLLMs: Refreshing Large Language Models with Search Engine Augmentation},
    author={Tu Vu and Mohit Iyyer and Xuezhi Wang and Noah Constant and Jerry Wei and Jason Wei and Chris Tar and Yun-Hsuan Sung and Denny Zhou and Quoc Le and Thang Luong},
    year={2023},
    eprint={2310.03214},
    archivePrefix={arXiv},
    primaryClass={cs.CL}
    }
    ```
    """)
ui/leaderboard_tab.py ADDED
@@ -0,0 +1,229 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ λ¦¬λ”λ³΄λ“œ νƒ­ UI μ»΄ν¬λ„ŒνŠΈ
3
+
4
+ πŸ† Leaderboard νƒ­μ˜ UI와 λ‘œμ§μ„ κ΄€λ¦¬ν•©λ‹ˆλ‹€.
5
+ """
6
+
7
+ import gradio as gr
8
+ import pandas as pd
9
+ from src.leaderboard_manager import load_leaderboard_data, prepare_display_data
10
+
11
+
12
def create_leaderboard_tab():
    """Build the Leaderboard tab: search bar, Relaxed/Strict tables, handlers.

    The function creates Gradio components as a side effect and returns
    nothing. It is presumably called inside an enclosing ``gr.Blocks`` /
    tab context (the caller is not visible from this module).

    Data comes from ``load_leaderboard_data()`` and is split on the
    ``evaluation_mode`` column into a "Relaxed" and a "Strict" table. The
    search box filters both tables by a case-insensitive, literal substring
    match on the ``id`` (submitter) column.
    """

    # Top search bar: text input plus "clear" and "refresh" buttons.
    with gr.Row():
        with gr.Column(scale=12):
            search_input = gr.Textbox(
                label="제좜자 이름 검색",
                placeholder="πŸ” 제좜자 μ΄λ¦„μœΌλ‘œ 검색...",
                value="",
                container=False,
                elem_classes=["search-input"]
            )
        with gr.Column(scale=1, min_width=100):
            clear_search_btn = gr.Button(
                "πŸ—‘οΈ μ΄ˆκΈ°ν™”",
                variant="secondary",
                size="sm",
                elem_classes=["clear-search-btn"]
            )
        with gr.Column(scale=1, min_width=100):
            refresh_btn = gr.Button(
                "πŸ”„ μƒˆλ‘œκ³ μΉ¨",
                variant="primary",
                size="sm",
                elem_classes=["refresh-btn"]
            )

    # Source column -> header label shown in the tables. Insertion order is
    # the display order, so the list of exposed columns is derived from the
    # mapping instead of being maintained as a second, parallel literal.
    COLUMN_LABELS = {
        'rank': 'Rank',
        'id': 'ID',
        'model': 'Model',
        'description': 'Description',
        'accuracy': 'Accuracy',
        'fast_changing_accuracy': 'Fast-changing',
        'slow_changing_accuracy': 'Slow-changing',
        'never_changing_accuracy': 'Never-changing',
        'acc_vp': 'Valid Premise',
        'acc_fp': 'False Premise',
        'acc_vp_one_hop': 'VP One-hop',
        'acc_vp_two_hop': 'VP Multi-hop',
        'acc_fp_one_hop': 'FP One-hop',
        'acc_fp_two_hop': 'FP Multi-hop',
        'acc_politics': 'Politics',
        'acc_sports': 'Sports',
        'acc_entertainment': 'Entertainment',
        'acc_weather': 'Weather',
        'acc_world': 'World',
        'acc_economy': 'Economy',
        'acc_society': 'Society',
        'acc_it_science': 'IT/Science',
        'acc_life_culture': 'Life/Culture',
        'acc_unknown': 'Unknown'
    }
    DISPLAY_COLUMNS = list(COLUMN_LABELS)

    def format_leaderboard(df: pd.DataFrame) -> pd.DataFrame:
        """Select the exposed columns and rename them to their header labels.

        An empty input yields an empty frame that still carries every header,
        so the table keeps its column structure when there are no rows.
        """
        if df.empty:
            return pd.DataFrame(columns=[COLUMN_LABELS[col] for col in DISPLAY_COLUMNS])

        selected_columns = [col for col in DISPLAY_COLUMNS if col in df.columns]
        return df[selected_columns].rename(columns=COLUMN_LABELS)

    def build_leaderboard_state(source_df):
        """Return the formatted (relaxed, strict) tables for ``source_df``.

        ``None``, an empty frame, or a frame without an ``evaluation_mode``
        column all produce two empty, header-only tables.
        """
        if source_df is None:
            source_df = pd.DataFrame()

        if source_df.empty or 'evaluation_mode' not in source_df.columns:
            relaxed_df = pd.DataFrame()
            strict_df = pd.DataFrame()
        else:
            relaxed_df = source_df[source_df['evaluation_mode'] == 'Relaxed']
            strict_df = source_df[source_df['evaluation_mode'] == 'Strict']

        formatted_relaxed = format_leaderboard(prepare_display_data(relaxed_df))
        formatted_strict = format_leaderboard(prepare_display_data(strict_df))
        return formatted_relaxed, formatted_strict

    def filter_by_submitter(df, search_text):
        """Keep rows whose ``id`` contains ``search_text`` (literal, case-insensitive).

        Blank/None search text, a missing frame, or a missing ``id`` column
        leave the data unfiltered.
        """
        query = (search_text or "").strip()
        if df is None or not query or 'id' not in df.columns:
            return df

        ids = df['id']
        # regex=False: the text comes straight from the user, so characters
        # such as "(" or "*" must match literally instead of raising.
        # astype(str): the .str accessor fails on non-string columns; the
        # notna() guard stops missing ids from matching as the text "nan".
        mask = ids.notna() & ids.astype(str).str.contains(query, case=False, regex=False)
        return df[mask]

    def load_tables(search_text, failure_message):
        """Reload the data, apply the search filter and format both tables.

        On any failure the error is printed with ``failure_message`` and two
        header-only tables are returned so the UI keeps its columns.
        """
        try:
            all_df = load_leaderboard_data()
            return build_leaderboard_state(filter_by_submitter(all_df, search_text))
        except Exception as e:
            # UI boundary: report and degrade to empty tables rather than
            # surfacing a stack trace in the browser.
            print(f"{failure_message}: {e}")
            return format_leaderboard(pd.DataFrame()), format_leaderboard(pd.DataFrame())

    # Initial load is deliberately not wrapped: a failure here propagates,
    # exactly as before, so start-up problems are not hidden.
    leaderboard_data = load_leaderboard_data()
    relaxed_initial, strict_initial = build_leaderboard_state(leaderboard_data)

    # Relaxed-mode leaderboard
    with gr.Column(elem_classes=["leaderboard-group"]):
        gr.Markdown(
            "### 🟒 Relaxed Evaluation"
        )

        relaxed_leaderboard_table = gr.DataFrame(
            value=relaxed_initial,
            interactive=False,
            wrap=False,
            show_label=False,
            elem_classes=["leaderboard-table"]
        )

    # Strict-mode leaderboard
    with gr.Column(elem_classes=["leaderboard-group"]):
        gr.Markdown(
            "### πŸ”΄ Strict Evaluation"
        )

        strict_leaderboard_table = gr.DataFrame(
            value=strict_initial,
            interactive=False,
            wrap=False,
            show_label=False,
            elem_classes=["leaderboard-table"]
        )

    # Explanatory text about the leaderboard
    with gr.Column(elem_classes=["leaderboard-group"]):
        gr.Markdown("""
        이 λ¦¬λ”λ³΄λ“œλŠ” [FreshQA](https://github.com/freshllms/freshqa)μ—μ„œ μ˜κ°μ„ λ°›μ•„ λ§Œλ“€μ–΄μ‘ŒμŠ΅λ‹ˆλ‹€.
        fact type(fast changing, slow changing, never changing), μ „μ œμ˜ μ§„μ‹€μ„±,
        10개의 도메인에 따라 λ‚˜λ‰˜λŠ” μ§ˆλ¬Έλ“€μ„ 톡해 ν•œκ΅­μ–΄ 지식과 κ΄€λ ¨λœ LLM의 μ΅œμ‹ μ„±μ„ νŒλ‹¨ν•  수 μžˆμŠ΅λ‹ˆλ‹€.

        이 λ¦¬λ”λ³΄λ“œλŠ” IITP의 **β€œμƒμ„±ν˜• μ–Έμ–΄λͺ¨λΈμ˜ 지속가λŠ₯μ„±κ³Ό μ‹œκ°„μ˜ 흐름에 λ”°λ₯Έ μ΅œμ‹ μ„± λ°˜μ˜μ„ μœ„ν•œ ν•™μŠ΅ 및 ν™œμš© 기술 κ°œλ°œβ€** μ‚¬μ—…μ˜ 지원을 λ°›μ•„ μ œμž‘λ˜μ—ˆμŠ΅λ‹ˆλ‹€.

        결과의 λ¬΄κ²°μ„±Β·μœ νš¨μ„±μ„ μœ μ§€ν•˜κ³  **μˆœμœ„ μ‘°μž‘μ„ λ°©μ§€**ν•˜κΈ° μœ„ν•΄ 평가 λ°μ΄ν„°μ…‹μ˜ 정닡은 κΈ°λ°€λ‘œ μœ μ§€λ©λ‹ˆλ‹€.
        """)

    def filter_leaderboard_data(search_text):
        """Search handler: re-filter both tables from freshly loaded data."""
        return load_tables(search_text, "❌ λ¦¬λ”λ³΄λ“œ 데이터 필터링 μ‹€νŒ¨")

    search_input.change(
        fn=filter_leaderboard_data,
        inputs=[search_input],
        outputs=[relaxed_leaderboard_table, strict_leaderboard_table]
    )

    def clear_search():
        """Clear-button handler: empty the search box and show all rows."""
        formatted_relaxed, formatted_strict = load_tables("", "❌ λ¦¬λ”λ³΄λ“œ 데이터 λ‘œλ“œ μ‹€νŒ¨")
        return "", formatted_relaxed, formatted_strict

    clear_search_btn.click(
        fn=clear_search,
        outputs=[search_input, relaxed_leaderboard_table, strict_leaderboard_table]
    )

    def refresh_leaderboard(search_text=""):
        """Refresh-button handler: reload data, keeping the active search.

        The current search text is re-applied so the tables stay consistent
        with what the search box still displays after a refresh.
        """
        return load_tables(search_text, "❌ λ¦¬λ”λ³΄λ“œ μƒˆλ‘œκ³ μΉ¨ μ‹€νŒ¨")

    refresh_btn.click(
        fn=refresh_leaderboard,
        inputs=[search_input],
        outputs=[relaxed_leaderboard_table, strict_leaderboard_table]
    )
ui/styles.css ADDED
@@ -0,0 +1,136 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
/* Always show the vertical scrollbar so the page width does not shift
   when content height changes. */
html {
  overflow-y: scroll;
}

/* ================================
   Base container (uniform overall width)
   ================================ */
.gradio-container,
.main {
  max-width: 1400px !important; /* wide, but capped so the layout does not sprawl */
  width: 100% !important;
  margin: 0 auto !important;    /* always horizontally centered */
}

.fixed-list * {
  font-size: 15px !important;
}


/* ================================
   DataFrame base styles
   ================================ */
.dataframe {
  font-size: 16px !important;
  width: 100% !important;
}

.dataframe table {
  font-size: 16px !important;
  width: 100% !important;
  table-layout: auto !important;
}

.dataframe th {
  font-size: 18px !important;
  font-weight: bold !important;
  padding: 12px !important;
  white-space: nowrap !important;
}

.dataframe td {
  font-size: 16px !important;
  padding: 10px !important;
  white-space: nowrap !important;
}


/* ================================
   Leaderboard search bar
   ================================ */

/* Wrapper around the search field */
.search-input {
  margin: 8px 0 12px 0 !important;
  display: block;
}

/* The text field itself (single rule; previously split across two
   identical selectors) */
.search-input input {
  font-size: 16px !important;
  padding: 12px 16px !important;
  border-radius: 8px !important;
  border: 2px solid #e0e0e0 !important;
  transition: border-color 0.3s ease !important;

  /* Fixed height so the field lines up with the buttons beside it */
  height: 40px !important;
  box-sizing: border-box !important;

  margin: 0 !important;
}

.search-input input:focus {
  border-color: #4a90e2 !important;
  outline: none !important;
  box-shadow: 0 0 0 3px rgba(74, 144, 226, 0.1) !important;
}

/* ================================
   Buttons (default look mostly kept)
   ================================ */
.clear-search-btn,
.refresh-btn {
  border-radius: 8px !important;
  font-weight: 500 !important;

  /* Same height as the search field */
  height: 40px !important;
  padding: 0 16px !important;

  /* Small top offset carried over from the existing styling */
  margin-top: 4px !important;
}



/* ================================
   Leaderboard group / table spacing and card look
   ================================ */
.leaderboard-group {
  margin: 18px 0 28px 0 !important;
  padding: 12px 14px !important;
  border: 1px solid #eee;
  border-radius: 12px;
  background: #ffffff;
}

.leaderboard-table {
  margin-top: 8px !important;
}

/* Extra cell padding, leaderboard tables only */
.leaderboard-table .dataframe th {
  padding: 12px 14px !important;
}
.leaderboard-table .dataframe td {
  padding: 10px 14px !important;
}

/* ================================
   Submission status textbox: force scrolling and line wrapping
   ================================ */
.submission-status textarea {
  max-height: 420px !important;
  overflow-y: auto !important;
  white-space: pre-wrap !important;  /* keep line breaks */
  word-break: break-word !important; /* stop long words / tables from overflowing */
  text-align: left !important;       /* force left alignment */
}

/* Avoid nested scrollbars: the wrappers themselves never scroll */
.submission-status, .submission-status .wrap {
  max-height: none !important;
  overflow: visible !important;
}
ui/submission_tab.py ADDED
@@ -0,0 +1,98 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ 제좜 및 평가 νƒ­ UI μ»΄ν¬λ„ŒνŠΈ
3
+
4
+ πŸ“€ 제좜 및 평가 νƒ­μ˜ UI와 λ‘œμ§μ„ κ΄€λ¦¬ν•©λ‹ˆλ‹€.
5
+ """
6
+
7
+ import gradio as gr
8
+ from src.submission_handler import process_submission
9
+
10
+
11
def create_submission_tab():
    """Build the submission & evaluation tab UI.

    Renders the submission guide, the upload/metadata inputs, the submit
    button and the status box, then wires the button to
    ``process_submission``. Components are created as a side effect; nothing
    is returned.
    """

    # Static guide text: how to submit, how evaluation works, limits, timing.
    guide_markdown = """
    ### πŸ“‹ 제좜 방법
    - 데이터셋 탭을 ν†΅ν•˜μ—¬ test set λ‹€μš΄λ‘œλ“œ
    - 각 question에 λŒ€ν•œ model_response 생성
    - model_responseκ°€ μ±„μ›Œμ§„ CSV 파일 μ—…λ‘œλ“œ(UTF-8 인코딩)
    - 제좜자 이름(id), μ‚¬μš© λͺ¨λΈ, μ„€λͺ… μž‘μ„±ν•˜μ—¬ 제좜
    - μ„€λͺ…: λ”°λ‘œ μ μš©ν•œ 방법둠이 있으면 μž‘μ„±ν•΄ μ£Όμ„Έμš”. κ³΅λž€μΌ 경우 μ‚¬μš© λͺ¨λΈμ˜ 베이슀 μ„±λŠ₯으둜 κ°„μ£Όν•©λ‹ˆλ‹€.
    - μ‚¬μš© λͺ¨λΈ: responseλ₯Ό μƒμ„±ν•˜κΈ° μœ„ν•œ λͺ¨λΈμ€ 자유둭게 선택할 수 μžˆμŠ΅λ‹ˆλ‹€. μ‚¬μš©ν•œ λͺ¨λΈμ˜ **곡식 λͺ…μΉ­**을 μž‘μ„±ν•΄ μ£Όμ„Έμš”.

    <br>

    ### πŸ” 평가 방식
    - ν‰κ°€λŠ” upstage의 μ΅œμ‹  **solar λͺ¨λΈ**둜 μ§„ν–‰λ©λ‹ˆλ‹€. *(2025-11-11 κΈ°μ€€: solar-pro2-250909)*
    - 평가 κ²°κ³ΌλŠ” 전체 accuracy뿐 μ•„λ‹ˆλΌ fact type, μ „μ œ μœ νš¨μ„±, number of hop, 도메인별 λΆ„λ₯˜ μ μˆ˜λ„ μ œκ³΅ν•©λ‹ˆλ‹€.
    - ν•œ 번의 제좜둜 **relaxed evaluation**κ³Ό **strict evaluation**이 λ™μ‹œμ— μ§„ν–‰λ©λ‹ˆλ‹€.

    ##### πŸ”Ή relaxed evaluation
    - 닡변이 κ°€μ§„ μ£Όμš” μ •λ³΄μ˜ μ •ν™•μ„±μ—λ§Œ μ΄ˆμ μ„ 맞좰 ν‰κ°€ν•©λ‹ˆλ‹€.
    - ν™˜κ°μ΄λ‚˜ 였래된 정보가 ν¬ν•¨λ˜μ–΄ μžˆμ–΄λ„, μ£Όμš” 정보에 영ν–₯을 λ―ΈμΉ˜μ§€ μ•ŠμœΌλ©΄ μ •λ‹΅μœΌλ‘œ 인정될 수 μžˆμŠ΅λ‹ˆλ‹€.
    - λ‹΅λ³€ ν˜•μ‹μ΄ 잘λͺ»λœ 경우(예: λ‹€λ₯Έ μ–Έμ–΄λ‘œ λ‹΅λ³€)도 ν—ˆμš©λ©λ‹ˆλ‹€.

    ##### πŸ”Ή strict evaluation
    - μ£Όμš” μ •λ³΄μ˜ 정확성뿐 μ•„λ‹ˆλΌ, λͺ¨λ“  사싀이 μ •ν™•ν•˜κ³  μ΅œμ‹ μ΄μ–΄μ•Ό ν•©λ‹ˆλ‹€.
    - μ‚¬μ†Œν•œ ν™˜κ°μ΄λΌλ„ ν¬ν•¨λ˜λ©΄ μ •λ‹΅μœΌλ‘œ μΈμ •λ˜μ§€ μ•ŠμŠ΅λ‹ˆλ‹€.
    - β€œμ œ 지식은 2021λ…„ 9μ›”κΉŒμ§€μž…λ‹ˆλ‹€β€¦β€ 같은 였래된 정보 κ²½κ³  λ¬Έκ΅¬λŠ” κ·Έ λ‚΄μš©μ΄ λ³€κ²½λ˜μ§€ μ•Šμ•˜μŒμ΄ λͺ…ν™•ν•œ κ²½μš°μ—λ§Œ μ •λ‹΅μœΌλ‘œ μΈμ •λ©λ‹ˆλ‹€.

    <br>

    ### 🚫 제좜 μ œν•œ
    - μ‚¬μš©μžλ‹Ή **ν•˜λ£¨ μ΅œλŒ€ 3회 제좜** κ°€λŠ₯ν•©λ‹ˆλ‹€.
    - μ‹€νŒ¨ν•œ μ œμΆœμ€ μΉ΄μš΄νŠΈλ˜μ§€ μ•ŠμŠ΅λ‹ˆλ‹€.
    - 제좜 νšŸμˆ˜λŠ” **맀일 ν•œκ΅­ μ‹œκ°„ 00μ‹œ 00λΆ„**에 μ΄ˆκΈ°ν™”λ©λ‹ˆλ‹€.

    <br>

    ### ⏱️ 평가 μ†Œμš” μ‹œκ°„
    - 평가 μ†Œμš” μ‹œκ°„μ€ **μ œμΆœλ‹Ή μ•½ 30λΆ„**으둜 μ˜ˆμƒλ©λ‹ˆλ‹€.
    - λ™μ‹œμ— μ œμΆœν•œ μ°Έκ°€μžκ°€ λ§Žμ„ 경우 μ‹œκ°„μ΄ 증가할 수 μžˆμŠ΅λ‹ˆλ‹€.

    <br>
    """
    gr.Markdown(guide_markdown)

    # Inputs, created in the order they appear on the page.
    submission_file = gr.File(
        label="정닡이 ν¬ν•¨λœ CSV 파일 μ—…λ‘œλ“œ",
        file_types=['.csv'],
    )
    submitter_name = gr.Textbox(
        label="제좜자 이름",
        placeholder="예: AI Ambassador",
        value="Anonymous",
    )
    submit_model = gr.Textbox(
        label="μ‚¬μš©ν•œ λͺ¨λΈ",
        placeholder="μ‚¬μš©ν•œ λͺ¨λΈμ˜ 곡식 λͺ…칭을 μž‘μ„±ν•΄ μ£Όμ„Έμš”.",
        value="Anonymous Model",
    )
    submit_description = gr.Textbox(
        label="μ„€λͺ…",
        placeholder="λ”°λ‘œ μ μš©ν•œ 방법둠이 있으면 μž‘μ„±ν•΄ μ£Όμ„Έμš”.",
    )

    # Submit button (the only action button on this tab).
    submit_btn = gr.Button(
        "πŸš€ 제좜 및 평가 μ‹œμž‘",
        variant="primary",
    )

    # Read-only box that shows submission status and evaluation results.
    submission_status = gr.Textbox(
        label="제좜 μƒνƒœ",
        value="CSV νŒŒμΌμ„ μ—…λ‘œλ“œν•˜κ³  μ œμΆœν•˜μ„Έμš”.",
        interactive=False,
        lines=20,
        elem_classes=["submission-status"],
    )

    # Clicking submit passes the four inputs to process_submission and
    # writes its result into the status box.
    handler_inputs = [submission_file, submitter_name, submit_model, submit_description]
    submit_btn.click(
        fn=process_submission,
        inputs=handler_inputs,
        outputs=[submission_status],
        concurrency_limit=3,
    )