"""
|
|
=============================================================================
|
|
THE FACTORY - Evaluator
|
|
=============================================================================
|
|
Evalúa artefactos generados vs el objetivo.
|
|
Proporciona:
|
|
- confidence: 0.0 a 1.0
|
|
- feedback: sugerencias de mejora
|
|
=============================================================================
|
|
"""
|
|
|
|
import json
|
|
import logging
|
|
from typing import Dict, Any, Optional
|
|
|
|
import anthropic
|
|
|
|
from config import FactoryConfig, FunctionType
|
|
|
|
logger = logging.getLogger("factory.evaluator")
|
|
|
|
|
|
class Evaluator:
    """
    The Evaluator scores generated artifacts against the objective.
    """

    def __init__(self, config: FactoryConfig):
        self.config = config

        if config.anthropic_api_key:
            self.anthropic = anthropic.Anthropic(api_key=config.anthropic_api_key)
        else:
            self.anthropic = None
            logger.warning("ANTHROPIC_API_KEY not configured for Evaluator")

    def evaluate(
        self,
        artifact: Any,
        objective: str,
        function: FunctionType
    ) -> Dict[str, Any]:
        """
        Evaluate an artifact.

        Args:
            artifact: The generated artifact
            objective: The objective to satisfy
            function: Function type

        Returns:
            {
                "confidence": 0.0-1.0,
                "feedback": "improvement suggestions",
                "strengths": ["strength 1", ...],
                "weaknesses": ["weakness 1", ...],
                "cost_usd": evaluation cost
            }
        """
        if not self.anthropic:
            # Fallback: basic evaluation without an LLM
            return self._evaluate_basic(artifact, objective, function)

        return self._evaluate_with_llm(artifact, objective, function)

    def _evaluate_with_llm(
        self,
        artifact: Any,
        objective: str,
        function: FunctionType
    ) -> Dict[str, Any]:
        """Evaluate using Claude Haiku."""
        model = self.config.get_model(self.config.evaluator_model)

        # Prepare the artifact for evaluation
        artifact_str = self._prepare_artifact_for_eval(artifact, function)

        prompt = f"""Evaluate the following artifact against the specified objective.

OBJECTIVE:
{objective}

ARTIFACT TO EVALUATE:
{artifact_str}

Provide your evaluation as JSON with exactly this structure:
{{
"confidence": <number between 0.0 and 1.0>,
"feedback": "<specific improvement suggestions>",
"strengths": ["<strength 1>", "<strength 2>"],
"weaknesses": ["<weakness 1>", "<weakness 2>"],
"meets_objective": <true/false>
}}

Evaluation criteria:
- 0.9-1.0: Excellent, fully meets the objective
- 0.7-0.9: Good, mostly meets it with minor improvements possible
- 0.5-0.7: Acceptable, partially meets it
- 0.3-0.5: Deficient, needs significant improvement
- 0.0-0.3: Unacceptable, does not meet the objective

Be specific in the feedback so the next attempt can improve."""

        try:
            response = self.anthropic.messages.create(
                model=model.name,
                max_tokens=1024,
                messages=[{"role": "user", "content": prompt}]
            )

            # Extract the text blocks from the response
            response_text = ""
            for block in response.content:
                if hasattr(block, "text"):
                    response_text += block.text

            # Parse the JSON evaluation
            eval_result = self._parse_eval_response(response_text)

            # Compute cost from the per-1K-token rates on the model config
            input_tokens = response.usage.input_tokens
            output_tokens = response.usage.output_tokens
            cost = (
                (input_tokens / 1000) * model.cost_per_1k_input +
                (output_tokens / 1000) * model.cost_per_1k_output
            )
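            # Illustrative arithmetic (hypothetical rates, not the real ones):
            # 800 input + 200 output tokens at $0.25 / $1.25 per 1K tokens
            # -> 0.8 * 0.25 + 0.2 * 1.25 = $0.45 for this evaluation call.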

            eval_result["cost_usd"] = cost
            eval_result["tokens_used"] = input_tokens + output_tokens

            logger.info(f"Evaluation: confidence={eval_result['confidence']:.2f}")

            return eval_result

        except Exception as e:
            logger.error(f"Error during LLM evaluation: {e}")
            return self._evaluate_basic(artifact, objective, function)

    def _parse_eval_response(self, response: str) -> Dict[str, Any]:
        """Parse the evaluation response into a result dict."""
        # Try to extract JSON from the response
        try:
            # Look for the outermost JSON object in the response
            start = response.find("{")
            end = response.rfind("}") + 1
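            # Illustrative example (hypothetical response text): for
            #   'Here is my assessment:\n```json\n{"confidence": 0.8, ...}\n```'
            # the find/rfind pair isolates the outermost {...} span, so any
            # surrounding prose or Markdown fences are dropped before parsing.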

            if start >= 0 and end > start:
                json_str = response[start:end]
                result = json.loads(json_str)

                # Validate required fields
                confidence = float(result.get("confidence", 0.5))
                confidence = max(0.0, min(1.0, confidence))  # Clamp to [0, 1]

                return {
                    "confidence": confidence,
                    "feedback": result.get("feedback", "No specific feedback"),
                    "strengths": result.get("strengths", []),
                    "weaknesses": result.get("weaknesses", []),
                    "meets_objective": result.get("meets_objective", confidence >= 0.7)
                }

        except (json.JSONDecodeError, ValueError) as e:
            logger.warning(f"Error parsing evaluation: {e}")

        # Fallback when no valid JSON could be extracted
        return {
            "confidence": 0.5,
            "feedback": "The evaluation could not be parsed. Try again.",
            "strengths": [],
            "weaknesses": ["Evaluation failed"],
            "meets_objective": False
        }

    def _evaluate_basic(
        self,
        artifact: Any,
        objective: str,
        function: FunctionType
    ) -> Dict[str, Any]:
        """Basic heuristic evaluation without an LLM."""
        artifact_str = str(artifact) if artifact else ""

        # Simple heuristics, starting from a neutral score
        confidence = 0.5

        # Check minimum length
        if len(artifact_str) < 50:
            confidence -= 0.2
        elif len(artifact_str) > 200:
            confidence += 0.1

        # Check that the artifact contains words from the objective
        objective_words = set(objective.lower().split())
        artifact_words = set(artifact_str.lower().split())
        overlap = len(objective_words & artifact_words)

        if overlap >= len(objective_words) * 0.3:
            confidence += 0.2

        confidence = max(0.0, min(1.0, confidence))

        return {
            "confidence": confidence,
            "feedback": "Basic evaluation applied. Consider reviewing manually.",
            "strengths": ["Artifact generated"],
            "weaknesses": ["Limited automatic evaluation"],
            "meets_objective": confidence >= 0.7,
            "cost_usd": 0.0
        }

    def _prepare_artifact_for_eval(self, artifact: Any, function: FunctionType) -> str:
        """Prepare an artifact for evaluation as a string."""
        if artifact is None:
            return "[No artifact generated]"

        if function == FunctionType.IMAGE_GENERATION:
            if isinstance(artifact, dict):
                return f"[Image generated]\nURL: {artifact.get('url', 'N/A')}\nPrompt used: {artifact.get('prompt', 'N/A')}"
            return "[Image generated]"

        artifact_str = str(artifact)

        # Truncate very long artifacts
        if len(artifact_str) > 4000:
            return artifact_str[:4000] + "\n[...truncated for evaluation...]"

        return artifact_str
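

# Minimal usage sketch (illustrative only). It assumes FactoryConfig accepts an
# `anthropic_api_key` keyword; build the config however config.py actually
# defines it. Without an API key the call falls back to _evaluate_basic().
if __name__ == "__main__":
    import os

    config = FactoryConfig(anthropic_api_key=os.getenv("ANTHROPIC_API_KEY"))  # hypothetical constructor
    evaluator = Evaluator(config)

    result = evaluator.evaluate(
        artifact={"url": "https://example.com/image.png", "prompt": "a red bicycle"},
        objective="Generate an image of a red bicycle",
        function=FunctionType.IMAGE_GENERATION,
    )
    print(f"confidence={result['confidence']:.2f} feedback={result['feedback']}")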