# EvalAI / evaluator.py
import json
import random
import re

from openai import OpenAI

from config import Config
from chunking_utils import chunk_code_content, combine_chunk_evaluations, create_chunk_summary
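
# `Config` (imported above from config.py) is expected to expose at least the
# two attributes this module reads. A minimal sketch, inferred from usage in
# __init__ below (the env-var defaults are assumptions, not confirmed against
# the real config.py):
#
#   class Config:
#       EVALUATION_MODEL = os.getenv('EVALUATION_MODEL', 'openai')
#       OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')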


class AIEvaluator:
    def __init__(self):
        self.model = Config.EVALUATION_MODEL
        if self.model == 'openai':
            # Fail fast if no OpenAI API key is configured
            if not Config.OPENAI_API_KEY:
                raise ValueError("OpenAI API key not found. Please set OPENAI_API_KEY in your environment or .env file.")
            try:
                # Initialize OpenAI client (v1.0+ style)
                self.client = OpenAI(api_key=Config.OPENAI_API_KEY)
                print("βœ… OpenAI client initialized successfully")
            except Exception as e:
                print(f"❌ Error initializing OpenAI client: {e}")
                raise
    def evaluate_submission(self, submission, hackathon):
        """
        Evaluate a submission based on the hackathon criteria
        """
        if self.model == 'openai':
            # Check whether the content is too large and needs chunking
            code_content = submission.code_content or ""
            doc_content = submission.documentation_content or ""
            total_content = code_content + doc_content
            # If the combined content is large, use chunked evaluation
            if len(total_content) > 3000:  # 3K-character threshold (forces chunking early)
                print(f"πŸ“Š Large content detected ({len(total_content):,} chars), using chunked evaluation...")
                return self._evaluate_with_chunking(submission, hackathon)
            else:
                print(f"πŸ“Š Standard evaluation for content ({len(total_content):,} chars)...")
                return self._evaluate_with_openai(submission, hackathon)
        else:
            return self._evaluate_with_unixcoder(submission, hackathon)
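
    # Routing at a glance (sizes refer to code + documentation combined):
    #   len(total_content) <= 3000 -> single-shot _evaluate_with_openai()
    #   len(total_content) >  3000 -> _evaluate_with_chunking(), which splits
    #   the code into ~4,000-char chunks and merges the per-chunk scores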
    def _evaluate_with_openai(self, submission, hackathon):
        """
        Use OpenAI GPT-4o to evaluate the submission
        """
        evaluation_prompt = self._build_evaluation_prompt(submission, hackathon)
        print("πŸ“‹ EVALUATION PROMPT BEING SENT:")
        print("=" * 60)
        # Preview only the first 500 characters of long prompts
        print(evaluation_prompt[:500] + "..." if len(evaluation_prompt) > 500 else evaluation_prompt)
        print("=" * 60)
        try:
            print("πŸš€ Sending request to OpenAI GPT-4o...")
            print(f"πŸ“ Evaluation prompt length: {len(evaluation_prompt)} characters")
            # Use the OpenAI client to generate the evaluation
            response = self.client.chat.completions.create(
                model="gpt-4o",  # GPT-4o for best quality and speed
                messages=[
                    {"role": "system", "content": "You are a STRICT technical evaluator and hackathon judge. You must be critical, use the full scoring range 0-10, and provide differentiated scores. DO NOT give grade inflation. Most projects should score in the 4-7 range. Be harsh but fair."},
                    {"role": "user", "content": evaluation_prompt}
                ],
                temperature=0.1,  # Lower temperature for more consistent, strict evaluation
                max_tokens=2000
            )
            result_text = response.choices[0].message.content
            print("βœ… OpenAI response received!")
            print("=" * 80)
            print("πŸ€– OPENAI GPT-4o RESPONSE:")
            print("=" * 80)
            print(result_text)
            print("=" * 80)
            print(f"πŸ“Š Response length: {len(result_text)} characters")
            print(f"πŸ’° Tokens used: {response.usage.total_tokens if hasattr(response, 'usage') else 'Unknown'}")
            parsed_result = self._parse_evaluation_result(result_text)
            print("βœ… Response parsed successfully!")
            print(f"πŸ“ˆ Parsed scores: {parsed_result}")
            return parsed_result
        except Exception as e:
            print(f"❌ Error in OpenAI evaluation: {str(e)}")
            print("πŸ”„ Falling back to default scores...")
            return self._generate_fallback_scores()
    def _evaluate_with_chunking(self, submission, hackathon):
        """
        Evaluate large submissions by chunking the content
        """
        try:
            # Chunk the code content
            code_content = submission.code_content or ""
            chunks = chunk_code_content(code_content, max_chunk_size=4000)
            print(f"πŸ“¦ Created {len(chunks)} chunks for evaluation")
            print(create_chunk_summary(chunks))
            chunk_results = []
            for i, chunk in enumerate(chunks, 1):
                print(f"πŸ” Evaluating chunk {i}/{len(chunks)} ({chunk['size']:,} chars)...")
                # Create a temporary submission object for this chunk
                chunk_submission = type('ChunkSubmission', (), {
                    'code_content': chunk['content'],
                    'documentation_content': submission.documentation_content or "",
                    'project_name': f"{submission.project_name} (Chunk {i})",
                    'project_description': submission.project_description,
                    'team_name': submission.team_name,
                    'participant_email': submission.participant_email
                })()
                # Evaluate this chunk
                chunk_result = self._evaluate_with_openai(chunk_submission, hackathon)
                # Add chunk metadata
                chunk_result['chunk_id'] = i
                chunk_result['chunk_weight'] = chunk['size']  # Weight by content size
                chunk_results.append(chunk_result)
                print(f"βœ… Chunk {i} evaluated: {chunk_result['overall_score']}/10")
            # Combine results from all chunks
            print("πŸ”„ Combining results from all chunks...")
            combined_result = combine_chunk_evaluations(chunk_results)
            print(f"🎯 Final combined score: {combined_result['overall_score']}/10")
            return combined_result
        except Exception as e:
            print(f"❌ Error in chunked evaluation: {str(e)}")
            print("πŸ”„ Falling back to standard evaluation...")
            # Fall back to standard evaluation with truncated content
            return self._evaluate_with_openai_truncated(submission, hackathon)
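
    # Note on merging: each chunk result carries `chunk_weight` (its character
    # count). combine_chunk_evaluations() lives in chunking_utils; it is
    # assumed here to use that weight for a size-weighted average of the
    # per-chunk scores -- an inference from the field name, not confirmed
    # against that module's source.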
    def _evaluate_with_openai_truncated(self, submission, hackathon):
        """
        Evaluate with truncated content as a fallback
        """
        # Truncate content to a manageable size
        code_content = (submission.code_content or "")[:4000]
        doc_content = (submission.documentation_content or "")[:2000]
        # Create a truncated submission
        truncated_submission = type('TruncatedSubmission', (), {
            'code_content': code_content,
            'documentation_content': doc_content,
            'project_name': submission.project_name,
            'project_description': submission.project_description,
            'team_name': submission.team_name,
            'participant_email': submission.participant_email
        })()
        print("⚠️ Using truncated content for evaluation")
        result = self._evaluate_with_openai(truncated_submission, hackathon)
        # Keep feedback as-is without prefixing a truncation note
        return result
    def _build_evaluation_prompt(self, submission, hackathon):
        """
        Build the prompt for AI evaluation
        """
        # NOTE: `criteria` is parsed here but not yet interpolated into the
        # prompt text below, which relies on hackathon.evaluation_prompt.
        criteria = json.loads(hackathon.criteria) if hackathon.criteria else self._get_default_criteria()
        prompt = f"""
# STRICT Hackathon Evaluation - NO GRADE INFLATION

## Hackathon Information
**Name**: {hackathon.name}
**Theme/Description**: {hackathon.description}

## Evaluation Criteria
{hackathon.evaluation_prompt}

## Submission to Evaluate
**Team**: {submission.team_name}
**Project Name**: {submission.project_name}
**Description**: {submission.project_description}

### Code Content
```
{self._truncate_content(submission.code_content, 3000)}
```

### Documentation
```
{self._truncate_content(submission.documentation_content, 2000)}
```

## CRITICAL EVALUATION INSTRUCTIONS
You are a STRICT technical evaluator. Use the FULL range of scores 0-10. DO NOT give similar scores to different projects.

### SCORING GUIDELINES (BE HARSH AND REALISTIC):
**0-2: Poor/Failing**
- Major issues, non-functional, or completely irrelevant
- Severe security vulnerabilities or broken code
- No documentation, or completely unclear

**3-4: Below Average**
- Basic functionality but significant flaws
- Poor code quality, structure, or practices
- Minimal effort or incomplete implementation

**5-6: Average/Acceptable**
- Works as intended with minor issues
- Standard implementation, nothing special
- Adequate documentation and code quality

**7-8: Good/Above Average**
- Well implemented with good practices
- Shows clear understanding and effort
- Good documentation and structure

**9-10: Excellent/Outstanding**
- Exceptional quality, innovative approach
- Production-ready code with best practices
- Comprehensive documentation and testing

## STRICT EVALUATION CRITERIA:
1. **Relevance (0-10)**: Does it ACTUALLY solve the stated problem? Is it directly related to the theme?
2. **Technical Complexity (0-10)**: How sophisticated is the implementation? Rate actual technical depth, not just lines of code.
3. **Creativity (0-10)**: Is this a unique approach or just a standard tutorial implementation?
4. **Documentation (0-10)**: Is there a proper README, comments, and setup instructions? Can someone else run this?
5. **Productivity (0-10)**: Code organization, error handling, scalability, maintainability.

## ADDITIONAL KEY-POINT ANALYSIS (brief, 1-2 sentences each):
- Out-of-the-box thinking: How original/novel is the approach?
- Problem-solving skills: How effectively does the code decompose and solve the problem?
- Research capabilities: Evidence of learning, citations, comparisons, benchmarking, or exploration
- Understanding the business: Does it align with real user/business needs and constraints?
- Use of non-famous tools or frameworks: Any lesser-known tech used purposefully

## MANDATORY REQUIREMENTS:
- VARY your scores significantly between projects
- Use decimals (e.g., 3.2, 6.7, 8.1) for precision
- Be CRITICAL and identify real weaknesses
- NO GRADE INFLATION - most projects should score in the 4-7 range
- Only exceptional projects deserve 8-10
- Don't hesitate to give low scores (1-3) for poor work

## Response Format (STRICT JSON):
```json
{{
    "relevance_score": <precise score 0-10 with 1 decimal>,
    "technical_complexity_score": <precise score 0-10 with 1 decimal>,
    "creativity_score": <precise score 0-10 with 1 decimal>,
    "documentation_score": <precise score 0-10 with 1 decimal>,
    "productivity_score": <precise score 0-10 with 1 decimal>,
    "overall_score": <calculated average with 1 decimal>,
    "feedback": "<HONEST, CRITICAL feedback. Point out specific flaws, missing features, and areas for improvement. Don't sugarcoat.>",
    "detailed_scores": {{
        "relevance_justification": "<specific reasons for this score>",
        "technical_justification": "<specific technical assessment>",
        "creativity_justification": "<specific creativity assessment>",
        "documentation_justification": "<specific documentation assessment>",
        "productivity_justification": "<specific code quality assessment>",
        "out_of_box_thinking": "<1-2 sentence assessment>",
        "problem_solving_skills": "<1-2 sentence assessment>",
        "research_capabilities": "<1-2 sentence assessment>",
        "business_understanding": "<1-2 sentence assessment>",
        "non_famous_tools_usage": "<1-2 sentence assessment>"
    }}
}}
```
REMEMBER: Be a tough but fair judge. Real-world projects have flaws - identify them!
"""
        return prompt
    def _truncate_content(self, content, max_length=2000):
        """
        Truncate content to fit within token limits
        """
        if not content:
            return "No content provided"
        if len(content) > max_length:
            return content[:max_length] + "\n... [content truncated]"
        return content
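
    # Behavior sketch (illustrative inputs):
    #   _truncate_content(None)             -> "No content provided"
    #   _truncate_content("x" * 5000, 2000) -> 2,000 chars + "\n... [content truncated]"
    #   _truncate_content("short")          -> "short"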
    def _parse_evaluation_result(self, result_text):
        """
        Parse the AI response into structured scores
        """
        try:
            # Prefer JSON inside a fenced ```json block
            json_match = re.search(r'```json\s*(.*?)\s*```', result_text, re.DOTALL)
            if json_match:
                json_str = json_match.group(1)
            else:
                # Otherwise, try to find any JSON object in the response
                json_match = re.search(r'\{.*\}', result_text, re.DOTALL)
                json_str = json_match.group(0) if json_match else result_text
            scores = json.loads(json_str)
            # Validate and normalize the scores
            return {
                'relevance_score': self._normalize_score(scores.get('relevance_score', 5.0)),
                'technical_complexity_score': self._normalize_score(scores.get('technical_complexity_score', 5.0)),
                'creativity_score': self._normalize_score(scores.get('creativity_score', 5.0)),
                'documentation_score': self._normalize_score(scores.get('documentation_score', 5.0)),
                'productivity_score': self._normalize_score(scores.get('productivity_score', 5.0)),
                'overall_score': self._normalize_score(scores.get('overall_score', 5.0)),
                'feedback': scores.get('feedback', 'Evaluation completed.'),
                'detailed_scores': json.dumps(scores.get('detailed_scores', {}))
            }
        except Exception as e:
            print(f"Error parsing evaluation result: {str(e)}")
            # Return fallback scores if parsing fails
            return self._generate_fallback_scores()
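
    # The parser accepts either a fenced block or a bare JSON object, e.g. a
    # model response containing:
    #
    #   ```json
    #   {"relevance_score": 6.3, "technical_complexity_score": 5.1, ... }
    #   ```
    #
    # Any key the model omits falls back to a neutral 5.0 (or the default
    # feedback string) via scores.get(...) above.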
    def _normalize_score(self, score):
        """
        Ensure the score is between 0 and 10
        """
        try:
            score = float(score)
            return max(0.0, min(10.0, score))
        except (TypeError, ValueError):
            # Non-numeric input: fall back to a neutral midpoint
            return 5.0
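
    # Examples: _normalize_score("7.5") -> 7.5, _normalize_score(15) -> 10.0,
    # _normalize_score("n/a") -> 5.0 (neutral default for unparsable input).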
    def _generate_fallback_scores(self):
        """
        Generate varied fallback scores when evaluation fails
        """
        # Generate varied scores in a realistic mid range (random is imported
        # at module level)
        scores = {
            'relevance_score': round(random.uniform(4.0, 6.5), 1),
            'technical_complexity_score': round(random.uniform(3.5, 6.0), 1),
            'creativity_score': round(random.uniform(3.0, 5.5), 1),
            'documentation_score': round(random.uniform(2.5, 5.0), 1),
            'productivity_score': round(random.uniform(3.5, 6.0), 1)
        }
        # Calculate the overall score as the average
        overall = sum(scores.values()) / len(scores)
        return {
            **scores,
            'overall_score': round(overall, 1),
            'feedback': 'Automatic evaluation completed due to a technical issue. Scores are estimated from basic analysis; manual review is strongly recommended for an accurate assessment.',
            'detailed_scores': json.dumps({
                'note': 'Fallback scores - technical evaluation failed',
                'recommendation': 'Manual review required for accurate scoring'
            })
        }
    def _get_default_criteria(self):
        """
        Get default evaluation criteria
        """
        return [
            {'name': 'Relevance', 'weight': 0.20},
            {'name': 'Technical Complexity', 'weight': 0.20},
            {'name': 'Creativity', 'weight': 0.20},
            {'name': 'Documentation', 'weight': 0.20},
            {'name': 'Productivity', 'weight': 0.20}
        ]
    def _evaluate_with_unixcoder(self, submission, hackathon):
        """
        Use UniXcoder for evaluation (placeholder for future implementation)
        """
        # This would use UniXcoder embeddings and similarity scoring.
        # For the MVP, fall back to OpenAI.
        return self._evaluate_with_openai(submission, hackathon)
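

# Minimal smoke test -- a hedged sketch, not part of the production flow. The
# SimpleNamespace stand-ins below are hypothetical; they only mimic the
# attributes this module reads from the real submission/hackathon models.
if __name__ == "__main__":
    from types import SimpleNamespace

    demo_submission = SimpleNamespace(
        code_content="def add(a, b):\n    return a + b\n",
        documentation_content="# Adder\nA tiny demo project.",
        project_name="Demo Adder",
        project_description="Adds two numbers.",
        team_name="Team Demo",
        participant_email="demo@example.com",
    )
    demo_hackathon = SimpleNamespace(
        name="Demo Hackathon",
        description="Build something small but correct.",
        evaluation_prompt="Score relevance, complexity, creativity, documentation, and productivity.",
        criteria=None,
    )
    # Requires OPENAI_API_KEY (and EVALUATION_MODEL='openai') to be configured;
    # otherwise AIEvaluator.__init__ raises.
    result = AIEvaluator().evaluate_submission(demo_submission, demo_hackathon)
    print(json.dumps(result, indent=2))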