import re
import pandas as pd
from openai import OpenAI
from typing import List, Dict, Any, Tuple
import time
import random
from config import Config
from src.utils import get_current_date_str
class FreshEval:
def __init__(self, model: str='solar-pro2', api_key: str=None):
self.model = model
self.api_key = api_key or Config.UPSTAGE_API_KEY
self.client = OpenAI(
api_key=self.api_key,
base_url="https://api.upstage.ai/v1/solar"
)
self.temperature = 0.0
self.max_tokens = 256
self.chat_completions = True
        if model.startswith('gpt-4') or model.startswith('solar'):
self.num_organic_results = 15
self.num_related_questions = 3
self.num_questions_and_answers = 3
self.num_retrieved_evidences = 15
else:
self.num_organic_results = 15
self.num_related_questions = 2
self.num_questions_and_answers = 2
self.num_retrieved_evidences = 5
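        # Example (traced from the branch above; model names other than
        # 'solar-pro2' are hypothetical):
        #   FreshEval(model='solar-pro2')  -> 15 retrieved evidences, 3 related questions
        #   FreshEval(model='llama-3-8b')  -> 5 retrieved evidences, 2 related questions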
def _is_rate_limit_error(self, error: Exception) -> bool:
"""429 μ—λŸ¬ 감지 ν•¨μˆ˜"""
error_str = str(error)
error_type = type(error).__name__
        # 1. Check the HTTP status code
if hasattr(error, 'response') and hasattr(error.response, 'status_code'):
if error.response.status_code == 429:
# print(f"βœ… HTTP 429 μ—λŸ¬ 감지: {error.response.status_code}")
return True
# 2. ν…μŠ€νŠΈ 기반 감지 (λ°±μ—…)
error_lower = error_str.lower()
if ("429" in error_lower or
"rate" in error_lower or
"limit" in error_lower or
"too_many_requests" in error_lower or
"request limit" in error_lower):
# print(f"βœ… ν…μŠ€νŠΈ 기반 429 μ—λŸ¬ 감지")
return True
return False
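    # Example behaviour of _is_rate_limit_error (hypothetical message shapes,
    # for illustration only):
    #   Exception("Error code: 429 - too_many_requests")  -> True  (text match)
    #   Exception("Connection refused")                   -> False
    # Note that the bare "rate"/"limit" substring checks are intentionally broad
    # and can also match unrelated errors that merely mention those words.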
def call_llm_api(self, prompt:str, current_date:str) -> str:
"""LLM API 호좜 ν•¨μˆ˜ (ν‚€ νšŒμ „ 및 λ°±μ˜€ν”„ 지원)"""
from src.api_key_rotator import get_rotator
rotator = get_rotator()
num_keys = len(rotator.keys)
base_delay = 3.0
def _make_api_call(eval_instance: FreshEval) -> str:
"""API 호좜 헬퍼 ν•¨μˆ˜"""
if eval_instance.chat_completions:
# Chat completions API
response = eval_instance.client.chat.completions.create(
model=eval_instance.model,
temperature=eval_instance.temperature,
max_tokens=eval_instance.max_tokens,
messages=[
{
"role": "system",
"content": (
f"You are a helpful assistant. Respond as concisely as possible. Knowledge cutoff: {current_date}."
)
},
{
"role": "user",
"content": "What's today's date?"
},
{
"role": "assistant",
"content": f"Today is {current_date} in Pacific Standard Time."
},
{
"role": "user",
"content": prompt
}
],
)
return response.choices[0].message.content
else:
# Completions API
response = eval_instance.client.completions.create(
model=eval_instance.model,
temperature=eval_instance.temperature,
max_tokens=eval_instance.max_tokens,
prompt=prompt,
)
return response.choices[0].text
        # Start with the current key
current_key = self.api_key
current_instance = FreshEval(model=self.model, api_key=current_key)
        # Single key: use the plain exponential-backoff logic only
if num_keys == 1:
max_retries = 7
for attempt in range(max_retries):
try:
return _make_api_call(current_instance)
except Exception as e:
if self._is_rate_limit_error(e):
if attempt < max_retries - 1:
                            # Exponential backoff with jitter
delay = base_delay * (2 ** attempt) + random.uniform(0, 1)
time.sleep(delay)
continue
                        # else:
                        #     print("❌ Maximum retry count exceeded")
                    raise e
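            # Worked example of the backoff above (base_delay = 3.0, jitter in [0, 1)):
            #   attempt 0 waits ~3 s, then ~6 s, ~12 s, ~24 s, ~48 s, ~96 s,
            #   for roughly 189 s of cumulative waiting across the seven attempts.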
            # Raise if the loop exhausts max_retries without ever returning
            raise Exception("call_llm_api: maximum retry count exceeded")
        # Two or more keys: key-rotation logic (with a 2-second wait between switches)
        # Cycle through the keys until success (at most 3 full passes over all keys)
        max_attempts = num_keys * 3  # try every key for at most 3 full cycles
        key_attempt_count = 0
        # First attempt with the current key
for attempt in range(max_attempts):
try:
                return _make_api_call(current_instance)  # return immediately on success
except Exception as e:
if self._is_rate_limit_error(e):
key_attempt_count += 1
# λ‹€μŒ ν‚€λ‘œ μ „ν™˜ν•˜κΈ° 전에 2초 λŒ€κΈ°
time.sleep(2)
current_key = rotator.pick_key()
# print("πŸ”‘ ν‚€ μ „ν™˜")
current_instance = FreshEval(model=self.model, api_key=current_key)
continue # λ‹€μŒ ν‚€λ‘œ 계속 μ‹œλ„
else:
# 429κ°€ μ•„λ‹Œ μ—λŸ¬λŠ” μ¦‰μ‹œ μ „νŒŒ
raise
# μ΅œλŒ€ μ‹œλ„ 횟수 초과 (λͺ¨λ“  ν‚€λ₯Ό μ—¬λŸ¬ 바퀴 μ‹œλ„ν–ˆμ§€λ§Œ λͺ¨λ‘ μ‹€νŒ¨)
raise Exception(f"λͺ¨λ“  API ν‚€μ—μ„œ 429 μ—λŸ¬ λ°œμƒ (μ΅œλŒ€ {max_attempts}회 μ‹œλ„)")
def call_fresheval(self, mode:str, question:str, evaluation:str, current_date:str) -> str:
"""FreshEval 평가 ν•¨μˆ˜"""
fresheval_question = f'\nquestion: {question}{evaluation}'
        # Prefer the prompt body (prefix + demos) from environment-based config
env_prompt_body = None
if mode == 'Relaxed':
env_prompt_body = Config.FRESHQA_PROMPT_RELAXED
elif mode == 'Strict':
env_prompt_body = Config.FRESHQA_PROMPT_STRICT
if env_prompt_body and str(env_prompt_body).strip():
base_prompt = str(env_prompt_body).strip()
else:
            raise ValueError(f"No evaluation prompt configured for mode '{mode}'.")
fresheval_prompt = base_prompt + fresheval_question
        # Evaluate
answer = self.call_llm_api(fresheval_prompt, current_date)
return answer
def extract_ratings(self, response:str) -> Tuple[bool, Dict[str, str]]:
"""평가 κ²°κ³Όμ—μ„œ λ“±κΈ‰ μΆ”μΆœ"""
def _clean(text: str) -> str:
            # Strip decoration/whitespace at both ends, remove inner markers, lowercase
text = re.sub(r'^[*`_~\s]+|[*`_~\s]+$', '', text)
text = re.sub(r'[*`_~]', '', text)
return text.strip().strip('.').strip().lower()
def _judge(val: str):
"""
λ¬Έμžμ—΄μ—μ„œ correct/incorrect νŒμ •.
- 'incorrect'κ°€ 보이면 무쑰건 FALSE
- 'partially correct'λŠ” λͺ¨ν˜Έ β†’ None
- 'correct'λŠ” TRUE
"""
if re.search(r'(?i)\bincorrect\b', val):
return 'FALSE'
if re.search(r'(?i)\bpartial(?:ly)?\s+correct\b', val):
return None
if re.search(r'(?i)\bcorrect\b', val):
return 'TRUE'
return None
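        # Traced examples for _judge (inputs are hypothetical cleaned strings):
        #   _judge("the response is incorrect")  -> 'FALSE'
        #   _judge("partially correct")          -> None  (ambiguous)
        #   _judge("correct")                    -> 'TRUE'
        #   _judge("no verdict here")            -> None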
def _from_label(block_label: str):
"""
라벨(예: 'Final Evaluation' λ˜λŠ” 'Evaluation') κΈ°μ€€μœΌλ‘œ
- 같은 쀄 캑처 λ¨Όμ € μ‹œλ„
- μ‹€νŒ¨ν•˜λ©΄ 라벨 이후 ~ λ‹€μŒ 빈 쀄 이전 λ²”μœ„μ—μ„œ νŒμ • ν‚€μ›Œλ“œ 탐색
"""
            # Same-line capture: label, optional decoration, colon, then the rest of the line
same_line = re.search(
rf'(?i){block_label}\s*(?:[*`_~]*\s*:\s*|:\s*[*`_~]*)\s*([^\r\n]+)',
response
)
if same_line:
val = _clean(same_line.group(1))
j = _judge(val)
if j is not None:
return j
            # Label found but the value wrapped to the next line: scan up to the next blank line (or section)
pos = re.search(
rf'(?i){block_label}\s*(?:[*`_~]*\s*:\s*|:\s*[*`_~]*)',
response
)
if pos:
tail = response[pos.end():]
# λ‹€μŒ '빈 쀄(연속 κ°œν–‰)' λ˜λŠ” λ‹€μŒ μ„Ήμ…˜ μ‹œμž‘ μ „κΉŒμ§€λ§Œ λ³Έλ‹€ (λ„ˆλ¬΄ 멀리 μ•ˆκ°€λ„λ‘)
m_stop = re.search(r'\n\s*\n', tail)
segment = tail[:m_stop.start()] if m_stop else tail[:300] # μ•ˆμ „ν•œ μƒν•œ
seg_clean = _clean(segment)
j = _judge(seg_clean)
if j is not None:
return j
return None
        # 1) 'Final Evaluation' has highest priority
        final_judgement = _from_label(r'final\s+evaluation')
if final_judgement:
return True, {'rating': final_judgement}
# 2) Evaluation
eval_judgement = _from_label('evaluation')
if eval_judgement:
return True, {'rating': eval_judgement}
        # 3) Fallback: "credited" sentences
if re.search(r'(?i)thus,\s*the\s*response\s*is\s*credited\b', response):
return True, {'rating': 'TRUE'}
if re.search(r'(?i)thus,\s*the\s*response\s*is\s*not\s*credited\b', response):
return True, {'rating': 'FALSE'}
        # 4) Failure
return False, {'rating': None}
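    # Hypothetical response shapes this parser accepts, for illustration:
    #   "Final Evaluation: **correct**"    -> (True, {'rating': 'TRUE'})
    #   "Evaluation:\nincorrect"           -> (True, {'rating': 'FALSE'})
    #   "Thus, the response is credited."  -> (True, {'rating': 'TRUE'})
    #   "no recognizable verdict"          -> (False, {'rating': None})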
def evaluate_single_row(self, row: pd.Series, mode: str, current_date:str) -> Dict[str, Any]:
"""단일 ν–‰ 평가"""
question = row['question']
response = row['model_response']
correct_answers = [row[f'answer_{i}'] for i in range(10)]
correct_answers = [str(x) for x in correct_answers if pd.notna(x) and str(x).strip()]
        # If model_response is empty or NaN, mark the row as wrong and return immediately
        if pd.isna(response) or (isinstance(response, str) and response.strip() == ''):
            # print('model_response is empty; treating as rating=0')
            row_dict = row.to_dict()
            row_dict['rating'] = 0
            row_dict['explanation'] = "model_response is empty"
return row_dict
        # Build the evaluation template
evaluation_template = (
"\ncorrect answer(s): {correct_answers}"
"\nresponse: {response}"
"\ncomment: "
)
evaluation = evaluation_template.format(
correct_answers=' | '.join(correct_answers),
response=response,
)
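        # Example of the rendered block (values hypothetical):
        #   correct answer(s): Sam Altman | Samuel Altman
        #   response: Sam Altman
        #   comment: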
        # Evaluate
fresheval_response = self.call_fresheval(
mode=mode,
question=question,
evaluation=evaluation,
current_date=current_date
)
is_valid_eval, eval_result = self.extract_ratings(fresheval_response)
        # if is_valid_eval:
        #     print('done')
        # Cap re-evaluation at 5 attempts
        max_retries = 5
        retry_count = 0
        # Retry loop
while not is_valid_eval and retry_count < max_retries:
retry_count += 1
            # print(f'Invalid evaluation, re-grading... ({retry_count}/{max_retries})\n response: {fresheval_response}')
fresheval_response = self.call_fresheval(
mode=mode,
question=question,
evaluation=evaluation,
current_date=current_date
)
is_valid_eval, eval_result = self.extract_ratings(fresheval_response)
            # if is_valid_eval:
            #     print('done')
        # Use default values once max retries are exceeded
        if not is_valid_eval:
            # print(f'⚠️ Max retries ({max_retries}) exceeded. Using default: rating=0')
            eval_result = {'rating': 0}
            fresheval_response = "Default evaluation after exceeding max retries"
row_dict = row.to_dict()
row_dict['rating'] = eval_result['rating']
row_dict['explanation'] = fresheval_response
        # πŸ“Š DEBUG: detailed output for FALSE ratings only
        # if eval_result['rating'] == 'FALSE':
        #     print(f"\n{'='*80}")
        #     print(f"❌ Question rated FALSE")
        #     print(f"  Mode: {mode}")
        #     print(f"  Question: {question}")
        #     print(f"  Correct Answers: {' | '.join(correct_answers)}")
        #     print(f"  Model Response: {response}")
        #     print(f"\n  LLM evaluation response:")
        #     print(f"  {fresheval_response}")
        #     print(f"  Final Rating: {eval_result['rating']}")
        #     print(f"{'='*80}\n")
return row_dict
def evaluate_dataframe(self, df: pd.DataFrame, mode: str) -> pd.DataFrame:
"""λ°μ΄ν„°ν”„λ ˆμž„ 평가"""
freshevals = []
current_date = get_current_date_str()
len_df = len(df)
for index, row in df.iterrows():
            print(f'{mode} evaluation in progress... {index+1}/{len_df}')
row_dict = self.evaluate_single_row(row, mode, current_date)
freshevals.append(row_dict)
return pd.DataFrame(freshevals)
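# ---------------------------------------------------------------------------
# Minimal usage sketch (not part of the pipeline). Assumes Config.UPSTAGE_API_KEY
# and the FRESHQA_PROMPT_STRICT/RELAXED settings are populated; the sample data
# below is hypothetical. Column names follow evaluate_single_row: 'question',
# 'model_response', and 'answer_0' .. 'answer_9'.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    sample = {
        "question": ["Who is the CEO of OpenAI?"],
        "model_response": ["Sam Altman"],
    }
    # evaluate_single_row reads answer_0 .. answer_9; fill unused slots with None.
    sample.update({f"answer_{i}": [None] for i in range(10)})
    sample["answer_0"] = ["Sam Altman"]
    df = pd.DataFrame(sample)
    evaluator = FreshEval(model="solar-pro2")
    results = evaluator.evaluate_dataframe(df, mode="Strict")
    print(results[["question", "rating"]])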