import re
import pandas as pd
from openai import OpenAI
from typing import List, Dict, Any, Tuple
import time
import random
from config import Config
from src.utils import get_current_date_str
class FreshEval:
def __init__(self, model: str='solar-pro2', api_key: str=None):
self.model = model
self.api_key = api_key or Config.UPSTAGE_API_KEY
self.client = OpenAI(
api_key=self.api_key,
base_url="https://api.upstage.ai/v1/solar"
)
self.temperature = 0.0
self.max_tokens = 256
self.chat_completions = True
        if model.startswith('gpt-4') or model.startswith('solar'):
self.num_organic_results = 15
self.num_related_questions = 3
self.num_questions_and_answers = 3
self.num_retrieved_evidences = 15
else:
self.num_organic_results = 15
self.num_related_questions = 2
self.num_questions_and_answers = 2
self.num_retrieved_evidences = 5
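        # Example (traced from the branch above; model names other than
        # 'solar-pro2' are hypothetical):
        #   FreshEval(model='solar-pro2')  -> 15 retrieved evidences, 3 related questions
        #   FreshEval(model='llama-3-8b')  -> 5 retrieved evidences, 2 related questions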
def _is_rate_limit_error(self, error: Exception) -> bool:
"""429 μ—λŸ¬ 감지 ν•¨μˆ˜"""
error_str = str(error)
error_type = type(error).__name__
        # 1. Check the HTTP status code
if hasattr(error, 'response') and hasattr(error.response, 'status_code'):
if error.response.status_code == 429:
# print(f"βœ… HTTP 429 μ—λŸ¬ 감지: {error.response.status_code}")
return True
# 2. ν…μŠ€νŠΈ 기반 감지 (λ°±μ—…)
error_lower = error_str.lower()
if ("429" in error_lower or
"rate" in error_lower or
"limit" in error_lower or
"too_many_requests" in error_lower or
"request limit" in error_lower):
# print(f"βœ… ν…μŠ€νŠΈ 기반 429 μ—λŸ¬ 감지")
return True
return False
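    # Example behaviour of _is_rate_limit_error (hypothetical message shapes,
    # for illustration only):
    #   Exception("Error code: 429 - too_many_requests")  -> True  (text match)
    #   Exception("Connection refused")                   -> False
    # Note that the bare "rate"/"limit" substring checks are intentionally broad
    # and can also match unrelated errors that merely mention those words.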
def call_llm_api(self, prompt:str, current_date:str) -> str:
"""LLM API 호좜 ν•¨μˆ˜ (ν‚€ νšŒμ „ 및 λ°±μ˜€ν”„ 지원)"""
from src.api_key_rotator import get_rotator
rotator = get_rotator()
num_keys = len(rotator.keys)
base_delay = 3.0
def _make_api_call(eval_instance: FreshEval) -> str:
"""API 호좜 헬퍼 ν•¨μˆ˜"""
if eval_instance.chat_completions:
# Chat completions API
response = eval_instance.client.chat.completions.create(
model=eval_instance.model,
temperature=eval_instance.temperature,
max_tokens=eval_instance.max_tokens,
messages=[
{
"role": "system",
"content": (
f"You are a helpful assistant. Respond as concisely as possible. Knowledge cutoff: {current_date}."
)
},
{
"role": "user",
"content": "What's today's date?"
},
{
"role": "assistant",
"content": f"Today is {current_date} in Pacific Standard Time."
},
{
"role": "user",
"content": prompt
}
],
)
return response.choices[0].message.content
else:
# Completions API
response = eval_instance.client.completions.create(
model=eval_instance.model,
temperature=eval_instance.temperature,
max_tokens=eval_instance.max_tokens,
prompt=prompt,
)
return response.choices[0].text
        # Start with the current key
current_key = self.api_key
current_instance = FreshEval(model=self.model, api_key=current_key)
        # Single key: use the plain exponential-backoff logic only
if num_keys == 1:
max_retries = 7
for attempt in range(max_retries):
try:
return _make_api_call(current_instance)
except Exception as e:
if self._is_rate_limit_error(e):
if attempt < max_retries - 1:
                            # Exponential backoff with jitter
delay = base_delay * (2 ** attempt) + random.uniform(0, 1)
time.sleep(delay)
continue
                        # else:
                        #     print("❌ Maximum retry count exceeded")
                    raise e
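            # Worked example of the backoff above (base_delay = 3.0, jitter in [0, 1)):
            #   attempt 0 waits ~3 s, then ~6 s, ~12 s, ~24 s, ~48 s, ~96 s,
            #   for roughly 189 s of cumulative waiting across the seven attempts.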
            # Raise if the loop exhausts max_retries without ever returning
            raise Exception("call_llm_api: maximum retry count exceeded")
        # Two or more keys: key-rotation logic (with a 2-second wait between switches)
        # Cycle through the keys until success (at most 3 full passes over all keys)
        max_attempts = num_keys * 3  # try every key for at most 3 full cycles
        key_attempt_count = 0
        # First attempt with the current key
for attempt in range(max_attempts):
try:
                return _make_api_call(current_instance)  # return immediately on success
except Exception as e:
if self._is_rate_limit_error(e):
key_attempt_count += 1
# λ‹€μŒ ν‚€λ‘œ μ „ν™˜ν•˜κΈ° 전에 2초 λŒ€κΈ°
time.sleep(2)
current_key = rotator.pick_key()
# print("πŸ”‘ ν‚€ μ „ν™˜")
current_instance = FreshEval(model=self.model, api_key=current_key)
continue # λ‹€μŒ ν‚€λ‘œ 계속 μ‹œλ„
else:
# 429κ°€ μ•„λ‹Œ μ—λŸ¬λŠ” μ¦‰μ‹œ μ „νŒŒ
raise
# μ΅œλŒ€ μ‹œλ„ 횟수 초과 (λͺ¨λ“  ν‚€λ₯Ό μ—¬λŸ¬ 바퀴 μ‹œλ„ν–ˆμ§€λ§Œ λͺ¨λ‘ μ‹€νŒ¨)
raise Exception(f"λͺ¨λ“  API ν‚€μ—μ„œ 429 μ—λŸ¬ λ°œμƒ (μ΅œλŒ€ {max_attempts}회 μ‹œλ„)")
def call_fresheval(self, mode:str, question:str, evaluation:str, current_date:str) -> str:
"""FreshEval 평가 ν•¨μˆ˜"""
fresheval_question = f'\nquestion: {question}{evaluation}'
        # Prefer the prompt body (prefix + demos) from environment-based config
env_prompt_body = None
if mode == 'Relaxed':
env_prompt_body = Config.FRESHQA_PROMPT_RELAXED
elif mode == 'Strict':
env_prompt_body = Config.FRESHQA_PROMPT_STRICT
if env_prompt_body and str(env_prompt_body).strip():
base_prompt = str(env_prompt_body).strip()
else:
            raise ValueError(f"No evaluation prompt configured for mode '{mode}'.")
fresheval_prompt = base_prompt + fresheval_question
        # Evaluate
answer = self.call_llm_api(fresheval_prompt, current_date)
return answer
def extract_ratings(self, response:str) -> Tuple[bool, Dict[str, str]]:
"""평가 κ²°κ³Όμ—μ„œ λ“±κΈ‰ μΆ”μΆœ"""
def _clean(text: str) -> str:
            # Strip decoration/whitespace at both ends, remove inner markers, lowercase
text = re.sub(r'^[*`_~\s]+|[*`_~\s]+$', '', text)
text = re.sub(r'[*`_~]', '', text)
return text.strip().strip('.').strip().lower()
def _judge(val: str):
"""
λ¬Έμžμ—΄μ—μ„œ correct/incorrect νŒμ •.
- 'incorrect'κ°€ 보이면 무쑰건 FALSE
- 'partially correct'λŠ” λͺ¨ν˜Έ β†’ None
- 'correct'λŠ” TRUE
"""
if re.search(r'(?i)\bincorrect\b', val):
return 'FALSE'
if re.search(r'(?i)\bpartial(?:ly)?\s+correct\b', val):
return None
if re.search(r'(?i)\bcorrect\b', val):
return 'TRUE'
return None
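        # Traced examples for _judge (inputs are hypothetical cleaned strings):
        #   _judge("the response is incorrect")  -> 'FALSE'
        #   _judge("partially correct")          -> None  (ambiguous)
        #   _judge("correct")                    -> 'TRUE'
        #   _judge("no verdict here")            -> None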
def _from_label(block_label: str):
"""
라벨(예: 'Final Evaluation' λ˜λŠ” 'Evaluation') κΈ°μ€€μœΌλ‘œ
- 같은 쀄 캑처 λ¨Όμ € μ‹œλ„
- μ‹€νŒ¨ν•˜λ©΄ 라벨 이후 ~ λ‹€μŒ 빈 쀄 이전 λ²”μœ„μ—μ„œ νŒμ • ν‚€μ›Œλ“œ 탐색
"""
            # Same-line capture: label, optional decoration, colon, then the rest of the line
same_line = re.search(
rf'(?i){block_label}\s*(?:[*`_~]*\s*:\s*|:\s*[*`_~]*)\s*([^\r\n]+)',
response
)
if same_line:
val = _clean(same_line.group(1))
j = _judge(val)
if j is not None:
return j
            # Label found but the value wrapped to the next line: scan up to the next blank line (or section)
pos = re.search(
rf'(?i){block_label}\s*(?:[*`_~]*\s*:\s*|:\s*[*`_~]*)',
response
)
if pos:
tail = response[pos.end():]
# λ‹€μŒ '빈 쀄(연속 κ°œν–‰)' λ˜λŠ” λ‹€μŒ μ„Ήμ…˜ μ‹œμž‘ μ „κΉŒμ§€λ§Œ λ³Έλ‹€ (λ„ˆλ¬΄ 멀리 μ•ˆκ°€λ„λ‘)
m_stop = re.search(r'\n\s*\n', tail)
segment = tail[:m_stop.start()] if m_stop else tail[:300] # μ•ˆμ „ν•œ μƒν•œ
seg_clean = _clean(segment)
j = _judge(seg_clean)
if j is not None:
return j
return None
        # 1) 'Final Evaluation' has highest priority
        final_judgement = _from_label(r'final\s+evaluation')
if final_judgement:
return True, {'rating': final_judgement}
# 2) Evaluation
eval_judgement = _from_label('evaluation')
if eval_judgement:
return True, {'rating': eval_judgement}
        # 3) Fallback: "credited" sentences
if re.search(r'(?i)thus,\s*the\s*response\s*is\s*credited\b', response):
return True, {'rating': 'TRUE'}
if re.search(r'(?i)thus,\s*the\s*response\s*is\s*not\s*credited\b', response):
return True, {'rating': 'FALSE'}
        # 4) Failure
return False, {'rating': None}
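    # Hypothetical response shapes this parser accepts, for illustration:
    #   "Final Evaluation: **correct**"    -> (True, {'rating': 'TRUE'})
    #   "Evaluation:\nincorrect"           -> (True, {'rating': 'FALSE'})
    #   "Thus, the response is credited."  -> (True, {'rating': 'TRUE'})
    #   "no recognizable verdict"          -> (False, {'rating': None})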
def evaluate_single_row(self, row: pd.Series, mode: str, current_date:str) -> Dict[str, Any]:
"""단일 ν–‰ 평가"""
question = row['question']
response = row['model_response']
correct_answers = [row[f'answer_{i}'] for i in range(10)]
correct_answers = [str(x) for x in correct_answers if pd.notna(x) and str(x).strip()]
        # If model_response is empty or NaN, mark the row as wrong and return immediately
        if pd.isna(response) or (isinstance(response, str) and response.strip() == ''):
            # print('model_response is empty; treating as rating=0')
            row_dict = row.to_dict()
            row_dict['rating'] = 0
            row_dict['explanation'] = "model_response is empty"
return row_dict
        # Build the evaluation template
evaluation_template = (
"\ncorrect answer(s): {correct_answers}"
"\nresponse: {response}"
"\ncomment: "
)
evaluation = evaluation_template.format(
correct_answers=' | '.join(correct_answers),
response=response,
)
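        # Example of the rendered block (values hypothetical):
        #   correct answer(s): Sam Altman | Samuel Altman
        #   response: Sam Altman
        #   comment: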
        # Evaluate
fresheval_response = self.call_fresheval(
mode=mode,
question=question,
evaluation=evaluation,
current_date=current_date
)
is_valid_eval, eval_result = self.extract_ratings(fresheval_response)
        # if is_valid_eval:
        #     print('done')
        # Cap re-evaluation at 5 attempts
        max_retries = 5
        retry_count = 0
        # Retry loop
while not is_valid_eval and retry_count < max_retries:
retry_count += 1
            # print(f'Invalid evaluation, re-grading... ({retry_count}/{max_retries})\n response: {fresheval_response}')
fresheval_response = self.call_fresheval(
mode=mode,
question=question,
evaluation=evaluation,
current_date=current_date
)
is_valid_eval, eval_result = self.extract_ratings(fresheval_response)
            # if is_valid_eval:
            #     print('done')
        # Use default values once max retries are exceeded
        if not is_valid_eval:
            # print(f'⚠️ Max retries ({max_retries}) exceeded. Using default: rating=0')
            eval_result = {'rating': 0}
            fresheval_response = "Default evaluation after exceeding max retries"
row_dict = row.to_dict()
row_dict['rating'] = eval_result['rating']
row_dict['explanation'] = fresheval_response
        # πŸ“Š DEBUG: detailed output for FALSE ratings only
        # if eval_result['rating'] == 'FALSE':
        #     print(f"\n{'='*80}")
        #     print(f"❌ Question rated FALSE")
        #     print(f"  Mode: {mode}")
        #     print(f"  Question: {question}")
        #     print(f"  Correct Answers: {' | '.join(correct_answers)}")
        #     print(f"  Model Response: {response}")
        #     print(f"\n  LLM evaluation response:")
        #     print(f"  {fresheval_response}")
        #     print(f"  Final Rating: {eval_result['rating']}")
        #     print(f"{'='*80}\n")
return row_dict
def evaluate_dataframe(self, df: pd.DataFrame, mode: str) -> pd.DataFrame:
"""λ°μ΄ν„°ν”„λ ˆμž„ 평가"""
freshevals = []
current_date = get_current_date_str()
len_df = len(df)
for index, row in df.iterrows():
            print(f'{mode} evaluation in progress... {index+1}/{len_df}')
row_dict = self.evaluate_single_row(row, mode, current_date)
freshevals.append(row_dict)
return pd.DataFrame(freshevals)
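# ---------------------------------------------------------------------------
# Minimal usage sketch (not part of the pipeline). Assumes Config.UPSTAGE_API_KEY
# and the FRESHQA_PROMPT_STRICT/RELAXED settings are populated; the sample data
# below is hypothetical. Column names follow evaluate_single_row: 'question',
# 'model_response', and 'answer_0' .. 'answer_9'.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    sample = {
        "question": ["Who is the CEO of OpenAI?"],
        "model_response": ["Sam Altman"],
    }
    # evaluate_single_row reads answer_0 .. answer_9; fill unused slots with None.
    sample.update({f"answer_{i}": [None] for i in range(10)})
    sample["answer_0"] = ["Sam Altman"]
    df = pd.DataFrame(sample)
    evaluator = FreshEval(model="solar-pro2")
    results = evaluator.evaluate_dataframe(df, mode="Strict")
    print(results[["question", "rating"]])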