# textilindo-ai-assistant / app_fixed.py
# Uploaded by harismlnaslm — "Add fixed app and dataset loading script"
# (commit ef903ff)
#!/usr/bin/env python3
"""
Textilindo AI Assistant - Hugging Face Spaces
"""
from flask import Flask, request, jsonify, render_template
import os
import json
import requests
from difflib import SequenceMatcher
import logging

# Setup logging (module-level logger named after this module).
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Flask application instance; routes are registered on it below.
app = Flask(__name__)
def load_system_prompt(default_text):
    """Return the system prompt from configs/system_prompt.md, or *default_text*.

    The markdown file may embed the prompt between a pair of triple-quote
    markers; if so, only that enclosed section is returned.  Otherwise every
    non-heading line is kept.  Any failure (missing file, bad encoding, ...)
    falls back to the supplied default.
    """
    try:
        prompt_path = os.path.join(
            os.path.dirname(__file__), 'configs', 'system_prompt.md'
        )
        if not os.path.exists(prompt_path):
            return default_text
        with open(prompt_path, 'r', encoding='utf-8') as fh:
            raw = fh.read()
        # Prefer the text enclosed by the first and last triple-quote markers.
        first = raw.find('"""')
        last = raw.rfind('"""')
        if -1 < first < last:
            return raw[first + 3:last].strip()
        # Otherwise strip markdown headings (#-prefixed lines) and keep the rest.
        body = '\n'.join(
            ln for ln in raw.splitlines() if not ln.strip().startswith('#')
        ).strip()
        return body if body else default_text
    except Exception:
        # Best effort: any problem reading/parsing yields the default prompt.
        return default_text
class TextilindoAI:
    """Retrieval-augmented assistant over Textilindo Q&A example datasets.

    Loads JSONL example files at construction time and answers chat requests
    by retrieving the most similar stored examples as context.
    """

    def __init__(self):
        # SYSTEM_PROMPT env var overrides the prompt file / built-in default.
        self.system_prompt = os.getenv(
            'SYSTEM_PROMPT',
            load_system_prompt("You are Textilindo AI Assistant. Be concise, helpful, and use Indonesian.")
        )
        self.dataset = self.load_all_datasets()

    def load_all_datasets(self):
        """Load every .jsonl file from the first data directory that exists.

        Returns a list of dicts, one per valid JSON line.  Malformed lines
        are skipped with a warning instead of aborting the whole load.
        """
        dataset = []
        # Try multiple possible data directory paths (cwd-relative and
        # container/module-relative layouts).
        possible_data_dirs = [
            "data",
            "./data",
            "/app/data",
            os.path.join(os.path.dirname(__file__), "data"),
        ]
        data_dir = None
        for dir_path in possible_data_dirs:
            if os.path.exists(dir_path):
                data_dir = dir_path
                logger.info(f"Found data directory: {data_dir}")
                break
        if not data_dir:
            logger.warning("No data directory found in any of the expected locations")
            return dataset
        # Load all JSONL files found in the directory.
        try:
            for filename in os.listdir(data_dir):
                if not filename.endswith('.jsonl'):
                    continue
                filepath = os.path.join(data_dir, filename)
                loaded_before = len(dataset)
                try:
                    with open(filepath, 'r', encoding='utf-8') as f:
                        for line_num, line in enumerate(f, 1):
                            line = line.strip()
                            if not line:
                                continue
                            try:
                                dataset.append(json.loads(line))
                            except json.JSONDecodeError as e:
                                # BUG FIX: the log message previously printed the
                                # literal "(unknown)" instead of the file name.
                                logger.warning(f"Invalid JSON in {filename} line {line_num}: {e}")
                    # BUG FIX: report the per-file count; the old code logged a
                    # cumulative count filtered by 'instruction' presence.
                    logger.info(f"Loaded {filename}: {len(dataset) - loaded_before} examples")
                except Exception as e:
                    logger.error(f"Error loading {filename}: {e}")
        except Exception as e:
            logger.error(f"Error reading data directory {data_dir}: {e}")
        logger.info(f"Total examples loaded: {len(dataset)}")
        return dataset

    def find_relevant_context(self, user_query, top_k=3):
        """Return up to *top_k* dataset examples most similar to *user_query*.

        Similarity is a weighted SequenceMatcher ratio (70% instruction,
        30% output); examples scoring <= 0.1 are discarded.
        """
        if not self.dataset:
            return []
        scores = []
        query = user_query.lower()
        for i, example in enumerate(self.dataset):
            instruction = example.get('instruction', '').lower()
            output = example.get('output', '').lower()
            instruction_score = SequenceMatcher(None, query, instruction).ratio()
            output_score = SequenceMatcher(None, query, output).ratio()
            combined_score = (instruction_score * 0.7) + (output_score * 0.3)
            scores.append((combined_score, i))
        scores.sort(reverse=True)
        relevant_examples = []
        for score, idx in scores[:top_k]:
            if score > 0.1:  # minimum-relevance threshold
                relevant_examples.append(self.dataset[idx])
        return relevant_examples

    def create_context_prompt(self, user_query, relevant_examples):
        """Build an Indonesian few-shot prompt from *relevant_examples*.

        Returns *user_query* unchanged when there are no examples.
        """
        if not relevant_examples:
            return user_query
        context_parts = []
        context_parts.append("Berikut adalah beberapa contoh pertanyaan dan jawaban tentang Textilindo:")
        context_parts.append("")
        for i, example in enumerate(relevant_examples, 1):
            instruction = example.get('instruction', '')
            output = example.get('output', '')
            context_parts.append(f"Contoh {i}:")
            context_parts.append(f"Pertanyaan: {instruction}")
            context_parts.append(f"Jawaban: {output}")
            context_parts.append("")
        context_parts.append("Berdasarkan contoh di atas, jawab pertanyaan berikut:")
        context_parts.append(f"Pertanyaan: {user_query}")
        context_parts.append("Jawaban:")
        return "\n".join(context_parts)

    def chat(self, message, max_tokens=300, temperature=0.7):
        """Answer *message*; returns a dict with success/response/context info.

        max_tokens and temperature are accepted for API compatibility but the
        placeholder implementation below does not use them yet.
        """
        relevant_examples = self.find_relevant_context(message, 3)
        if relevant_examples:
            enhanced_prompt = self.create_context_prompt(message, relevant_examples)
            context_used = True
        else:
            enhanced_prompt = message
            context_used = False
        # For now, return a simple canned response.  In production this would
        # call the HF Space inference endpoint with `enhanced_prompt`,
        # `max_tokens`, and `temperature`.
        response = f"Terima kasih atas pertanyaan Anda: {message}. Saya akan membantu Anda dengan informasi tentang Textilindo."
        return {
            "success": True,
            "response": response,
            "context_used": context_used,
            "relevant_examples_count": len(relevant_examples)
        }
# Initialize the shared AI instance once at import time; all route handlers
# below read from it (datasets are loaded here).
ai = TextilindoAI()
@app.route('/health', methods=['GET'])
def health_check():
    """Liveness probe: reports service name plus dataset availability."""
    size = len(ai.dataset)
    payload = {
        "status": "healthy",
        "service": "Textilindo AI Assistant",
        "dataset_loaded": size > 0,
        "dataset_size": size,
    }
    return jsonify(payload)
@app.route('/chat', methods=['POST'])
def chat():
    """Main chat endpoint.

    Expects JSON: {"message": str, "max_tokens": int?, "temperature": float?}.
    Returns the AI result as JSON; 400 on missing/invalid input, 500 on
    internal errors.
    """
    try:
        data = request.get_json()
        if not data:
            return jsonify({
                "success": False,
                "error": "No JSON data provided"
            }), 400
        # BUG FIX: a non-string "message" (e.g. a JSON number) used to raise
        # AttributeError on .strip() and surface as a 500; reject it as 400.
        message = data.get('message', '')
        if not isinstance(message, str) or not message.strip():
            return jsonify({
                "success": False,
                "error": "Message is required"
            }), 400
        message = message.strip()
        # Optional generation parameters, passed through to ai.chat.
        max_tokens = data.get('max_tokens', 300)
        temperature = data.get('temperature', 0.7)
        result = ai.chat(message, max_tokens, temperature)
        if result["success"]:
            return jsonify(result)
        return jsonify(result), 500
    except Exception as e:
        # logger.exception records the traceback, unlike logger.error.
        logger.exception(f"Error in chat endpoint: {e}")
        return jsonify({
            "success": False,
            "error": f"Internal server error: {str(e)}"
        }), 500
@app.route('/stats', methods=['GET'])
def get_stats():
    """Return dataset topic breakdown and basic system status."""
    try:
        # Tally examples per metadata topic; missing metadata -> 'unknown'.
        topics = {}
        for entry in ai.dataset:
            topic = entry.get('metadata', {}).get('topic', 'unknown')
            topics[topic] = topics.get(topic, 0) + 1
        payload = {
            "success": True,
            "dataset": {
                "total_examples": len(ai.dataset),
                "topics": topics,
                "topics_count": len(topics),
            },
            "system": {
                "api_version": "1.0.0",
                "status": "operational",
            },
        }
        return jsonify(payload)
    except Exception as e:
        logger.error(f"Error in stats endpoint: {e}")
        return jsonify({
            "success": False,
            "error": f"Internal server error: {str(e)}"
        }), 500
@app.route('/', methods=['GET'])
def root():
    """API root endpoint with self-describing documentation."""
    endpoints = {
        "GET /": "API documentation (this endpoint)",
        "GET /health": "Health check",
        "POST /chat": "Chat with AI",
        "GET /stats": "Dataset and system statistics",
    }
    chat_usage = {
        "method": "POST",
        "url": "/chat",
        "body": {
            "message": "string (required)",
            "max_tokens": "integer (optional, default: 300)",
            "temperature": "float (optional, default: 0.7)",
        },
    }
    return jsonify({
        "service": "Textilindo AI Assistant",
        "version": "1.0.0",
        "description": "AI-powered customer service for Textilindo",
        "endpoints": endpoints,
        "usage": {"chat": chat_usage},
        "dataset_size": len(ai.dataset),
    })
if __name__ == '__main__':
    logger.info("Starting Textilindo AI Assistant...")
    logger.info(f"Dataset loaded: {len(ai.dataset)} examples")
    # Bind to all interfaces so the container's mapped port is reachable.
    # NOTE(review): port 8080 is hard-coded — confirm it matches the Space /
    # container configuration (HF Spaces commonly expect 7860).
    app.run(
        debug=False,
        host='0.0.0.0',
        port=8080
    )