import os
from openai import OpenAI
import json
import re
from config import Config
from chunking_utils import chunk_code_content, combine_chunk_evaluations, create_chunk_summary
class AIEvaluator:
def __init__(self):
self.model = Config.EVALUATION_MODEL
if self.model == 'openai':
# Ensure the OpenAI API key is available before creating the client
if not Config.OPENAI_API_KEY:
raise ValueError("OpenAI API key not found. Please set OPENAI_API_KEY in your environment or .env file.")
try:
# Initialize OpenAI client (v1.0+ style)
self.client = OpenAI(api_key=Config.OPENAI_API_KEY)
print("β
OpenAI client initialized successfully")
except Exception as e:
print(f"β Error initializing OpenAI client: {e}")
raise e
def evaluate_submission(self, submission, hackathon):
"""
Evaluate a submission based on the hackathon criteria
"""
if self.model == 'openai':
# Check if content is too large and needs chunking
code_content = submission.code_content or ""
doc_content = submission.documentation_content or ""
total_content = code_content + doc_content
# If content is large, use chunked evaluation
if len(total_content) > 3000: # 3K characters threshold (force chunking earlier)
print(f"π Large content detected ({len(total_content):,} chars), using chunked evaluation...")
return self._evaluate_with_chunking(submission, hackathon)
else:
print(f"π Standard evaluation for content ({len(total_content):,} chars)...")
return self._evaluate_with_openai(submission, hackathon)
else:
return self._evaluate_with_unixcoder(submission, hackathon)
def _evaluate_with_openai(self, submission, hackathon):
"""
Use OpenAI GPT-4o to evaluate the submission
"""
evaluation_prompt = self._build_evaluation_prompt(submission, hackathon)
print("π EVALUATION PROMPT BEING SENT:")
print("=" * 60)
print(evaluation_prompt[:500] + "..." if len(evaluation_prompt) > 500 else evaluation_prompt)
print("=" * 60)
try:
print("π Sending request to OpenAI GPT-4o...")
print(f"π Evaluation prompt length: {len(evaluation_prompt)} characters")
# Use OpenAI client to generate evaluation
response = self.client.chat.completions.create(
model="gpt-4o", # Using GPT-4o for best quality and speed
messages=[
{"role": "system", "content": "You are a STRICT technical evaluator and hackathon judge. You must be critical, use the full scoring range 0-10, and provide differentiated scores. DO NOT give grade inflation. Most projects should score in the 4-7 range. Be harsh but fair."},
{"role": "user", "content": evaluation_prompt}
],
temperature=0.1, # Lower temperature for more consistent, strict evaluation
max_tokens=2000
)
result_text = response.choices[0].message.content
print("β
OpenAI Response received!")
print("=" * 80)
print("π€ OPENAI GPT-4o RESPONSE:")
print("=" * 80)
print(result_text)
print("=" * 80)
print(f"π Response length: {len(result_text)} characters")
print(f"π° Tokens used: {response.usage.total_tokens if hasattr(response, 'usage') else 'Unknown'}")
parsed_result = self._parse_evaluation_result(result_text)
print("β
Response parsed successfully!")
print(f"π Parsed scores: {parsed_result}")
return parsed_result
except Exception as e:
print(f"β Error in OpenAI evaluation: {str(e)}")
print("π Falling back to default scores...")
return self._generate_fallback_scores()
def _evaluate_with_chunking(self, submission, hackathon):
"""
Evaluate large submissions by chunking the content
"""
try:
# Chunk the code content
code_content = submission.code_content or ""
chunks = chunk_code_content(code_content, max_chunk_size=4000)
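# Assumption: chunk_code_content (imported from chunking_utils) returns a list of
# dicts exposing at least 'content' and 'size' keys, since that is what the loop
# below reads from each chunk.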
print(f"π¦ Created {len(chunks)} chunks for evaluation")
print(create_chunk_summary(chunks))
chunk_results = []
for i, chunk in enumerate(chunks, 1):
print(f"π Evaluating chunk {i}/{len(chunks)} ({chunk['size']:,} chars)...")
# Create a temporary submission object for this chunk
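# type() is used below to build a lightweight throwaway object that exposes the
# same attributes _build_evaluation_prompt() reads, so each chunk can be scored
# without constructing a real Submission model instance.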
chunk_submission = type('ChunkSubmission', (), {
'code_content': chunk['content'],
'documentation_content': submission.documentation_content or "",
'project_name': f"{submission.project_name} (Chunk {i})",
'project_description': submission.project_description,
'team_name': submission.team_name,
'participant_email': submission.participant_email
})()
# Evaluate this chunk
chunk_result = self._evaluate_with_openai(chunk_submission, hackathon)
# Add chunk metadata
chunk_result['chunk_id'] = i
chunk_result['chunk_weight'] = chunk['size'] # Weight by content size
chunk_results.append(chunk_result)
print(f"β
Chunk {i} evaluated: {chunk_result['overall_score']}/10")
# Combine results from all chunks
print("π Combining results from all chunks...")
combined_result = combine_chunk_evaluations(chunk_results)
print(f"π― Final combined score: {combined_result['overall_score']}/10")
return combined_result
except Exception as e:
print(f"β Error in chunked evaluation: {str(e)}")
print("π Falling back to standard evaluation...")
# Fallback to standard evaluation with truncated content
return self._evaluate_with_openai_truncated(submission, hackathon)
def _evaluate_with_openai_truncated(self, submission, hackathon):
"""
Evaluate with truncated content as fallback
"""
# Truncate content to manageable size
code_content = (submission.code_content or "")[:4000]
doc_content = (submission.documentation_content or "")[:2000]
# Create truncated submission
truncated_submission = type('TruncatedSubmission', (), {
'code_content': code_content,
'documentation_content': doc_content,
'project_name': submission.project_name,
'project_description': submission.project_description,
'team_name': submission.team_name,
'participant_email': submission.participant_email
})()
print("β οΈ Using truncated content for evaluation")
result = self._evaluate_with_openai(truncated_submission, hackathon)
# Keep feedback as-is without prefixing a truncation note
return result
def _build_evaluation_prompt(self, submission, hackathon):
"""
Build the prompt for AI evaluation
"""
criteria = json.loads(hackathon.criteria) if hackathon.criteria else self._get_default_criteria()
prompt = f"""
# STRICT Hackathon Evaluation - NO GRADE INFLATION
## Hackathon Information
**Name**: {hackathon.name}
**Theme/Description**: {hackathon.description}
## Evaluation Criteria
{hackathon.evaluation_prompt}
## Submission to Evaluate
**Team**: {submission.team_name}
**Project Name**: {submission.project_name}
**Description**: {submission.project_description}
### Code Content
```
{self._truncate_content(submission.code_content, 3000)}
```
### Documentation
```
{self._truncate_content(submission.documentation_content, 2000)}
```
## CRITICAL EVALUATION INSTRUCTIONS
You are a STRICT technical evaluator. Use the FULL range of scores 0-10. DO NOT give similar scores to different projects.
### SCORING GUIDELINES (BE HARSH AND REALISTIC):
**0-2: Poor/Failing**
- Major issues, non-functional, or completely irrelevant
- Severe security vulnerabilities or broken code
- No documentation or completely unclear
**3-4: Below Average**
- Basic functionality but significant flaws
- Poor code quality, structure, or practices
- Minimal effort or incomplete implementation
**5-6: Average/Acceptable**
- Works as intended with minor issues
- Standard implementation, nothing special
- Adequate documentation and code quality
**7-8: Good/Above Average**
- Well-implemented with good practices
- Shows clear understanding and effort
- Good documentation and structure
**9-10: Excellent/Outstanding**
- Exceptional quality, innovative approach
- Production-ready code with best practices
- Comprehensive documentation and testing
## STRICT EVALUATION CRITERIA:
1. **Relevance (0-10)**: Does it ACTUALLY solve the problem stated? Is it directly related to the theme?
2. **Technical Complexity (0-10)**: How sophisticated is the implementation? Rate based on actual technical depth, not just lines of code.
3. **Creativity (0-10)**: Is this a unique approach or just a standard tutorial implementation?
4. **Documentation (0-10)**: Is there proper README, comments, setup instructions? Can someone else run this?
5. **Productivity (0-10)**: Code organization, error handling, scalability, maintainability.
## ADDITIONAL KEY-POINT ANALYSIS (brief, 1-2 sentences each):
- Out of the box thinking: How original/novel is the approach?
- Problem-solving skills: How effectively does the code decompose and solve the problem?
- Research capabilities: Evidence of learning, citations, comparisons, benchmarking, or exploration
- Understanding the business: Does it align with real user/business needs and constraints?
- Use of non-famous tools or frameworks: Any lesser-known tech used purposefully
## MANDATORY REQUIREMENTS:
- VARY your scores significantly between projects
- Use decimals (e.g., 3.2, 6.7, 8.1) for precision
- Be CRITICAL and identify real weaknesses
- NO GRADE INFLATION - most projects should score 4-7 range
- Only exceptional projects deserve 8-10
- Don't hesitate to give low scores (1-3) for poor work
## Response Format (STRICT JSON):
```json
{{
"relevance_score": <precise score 0-10 with 1 decimal>,
"technical_complexity_score": <precise score 0-10 with 1 decimal>,
"creativity_score": <precise score 0-10 with 1 decimal>,
"documentation_score": <precise score 0-10 with 1 decimal>,
"productivity_score": <precise score 0-10 with 1 decimal>,
"overall_score": <calculated average with 1 decimal>,
"feedback": "<HONEST, CRITICAL feedback. Point out specific flaws, missing features, and areas for improvement. Don't sugarcoat.>",
"detailed_scores": {{
"relevance_justification": "<specific reasons for this score>",
"technical_justification": "<specific technical assessment>",
"creativity_justification": "<specific creativity assessment>",
"documentation_justification": "<specific documentation assessment>",
"productivity_justification": "<specific code quality assessment>",
"out_of_box_thinking": "<1-2 sentence assessment>",
"problem_solving_skills": "<1-2 sentence assessment>",
"research_capabilities": "<1-2 sentence assessment>",
"business_understanding": "<1-2 sentence assessment>",
"non_famous_tools_usage": "<1-2 sentence assessment>"
}}
}}
```
REMEMBER: Be a tough but fair judge. Real-world projects have flaws - identify them!
"""
return prompt
def _truncate_content(self, content, max_length=2000):
"""
Truncate content to fit within token limits
"""
if not content:
return "No content provided"
if len(content) > max_length:
return content[:max_length] + "\n... [content truncated]"
return content
def _parse_evaluation_result(self, result_text):
"""
Parse the AI response into structured scores
"""
try:
# Try to extract JSON from the response
json_match = re.search(r'```json\s*(.*?)\s*```', result_text, re.DOTALL)
if json_match:
json_str = json_match.group(1)
else:
# Try to find any JSON object in the response
json_match = re.search(r'\{.*\}', result_text, re.DOTALL)
json_str = json_match.group(0) if json_match else result_text
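# At this point json_str is the fenced ```json block if one was found,
# otherwise the widest '{...}' span in the response, otherwise the raw text.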
scores = json.loads(json_str)
# Validate and normalize scores
return {
'relevance_score': self._normalize_score(scores.get('relevance_score', 5.0)),
'technical_complexity_score': self._normalize_score(scores.get('technical_complexity_score', 5.0)),
'creativity_score': self._normalize_score(scores.get('creativity_score', 5.0)),
'documentation_score': self._normalize_score(scores.get('documentation_score', 5.0)),
'productivity_score': self._normalize_score(scores.get('productivity_score', 5.0)),
'overall_score': self._normalize_score(scores.get('overall_score', 5.0)),
'feedback': scores.get('feedback', 'Evaluation completed.'),
'detailed_scores': json.dumps(scores.get('detailed_scores', {}))
}
except Exception as e:
print(f"Error parsing evaluation result: {str(e)}")
# Return fallback scores if parsing fails
return self._generate_fallback_scores()
def _normalize_score(self, score):
"""
Ensure score is between 0 and 10
"""
try:
score = float(score)
return max(0.0, min(10.0, score))
except (TypeError, ValueError):
return 5.0
def _generate_fallback_scores(self):
"""
Generate varied fallback scores when evaluation fails
"""
import random
# Generate varied scores in the 4-6 range (realistic fallback)
scores = {
'relevance_score': round(random.uniform(4.0, 6.5), 1),
'technical_complexity_score': round(random.uniform(3.5, 6.0), 1),
'creativity_score': round(random.uniform(3.0, 5.5), 1),
'documentation_score': round(random.uniform(2.5, 5.0), 1),
'productivity_score': round(random.uniform(3.5, 6.0), 1)
}
# Calculate overall as average
overall = sum(scores.values()) / len(scores)
return {
**scores,
'overall_score': round(overall, 1),
'feedback': 'Automatic evaluation completed due to technical issue. Scores are estimated based on basic analysis. Manual review strongly recommended for accurate assessment.',
'detailed_scores': json.dumps({
'note': 'Fallback scores - technical evaluation failed',
'recommendation': 'Manual review required for accurate scoring'
})
}
def _get_default_criteria(self):
"""
Get default evaluation criteria
"""
return [
{'name': 'Relevance', 'weight': 0.20},
{'name': 'Technical Complexity', 'weight': 0.20},
{'name': 'Creativity', 'weight': 0.20},
{'name': 'Documentation', 'weight': 0.20},
{'name': 'Productivity', 'weight': 0.20}
]
def _evaluate_with_unixcoder(self, submission, hackathon):
"""
Use UniXCoder for evaluation (placeholder for future implementation)
"""
# This would use UniXCoder embeddings and similarity scoring
# For MVP, we'll fall back to OpenAI
return self._evaluate_with_openai(submission, hackathon)
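

# --- Usage sketch (illustrative, not part of the original application flow) ---
# A minimal example of driving AIEvaluator directly. The hackathon and submission
# objects below are hypothetical SimpleNamespace stand-ins; the real application
# presumably passes its own model instances carrying the same attributes that
# evaluate_submission() and _build_evaluation_prompt() read. Running this requires
# a valid OPENAI_API_KEY when Config.EVALUATION_MODEL is 'openai'.
if __name__ == "__main__":
    from types import SimpleNamespace

    demo_hackathon = SimpleNamespace(
        name="Demo Hackathon",
        description="Build tools that improve developer productivity.",
        evaluation_prompt="Judge relevance, technical depth, creativity, documentation, and code quality.",
        criteria=None,  # None -> _get_default_criteria() is used
    )
    demo_submission = SimpleNamespace(
        project_name="Sample CLI",
        project_description="A small command-line utility.",
        team_name="Team Example",
        participant_email="team@example.com",
        code_content="def main():\n    print('hello')\n",
        documentation_content="# Sample CLI\nRun with `python main.py`.",
    )

    evaluator = AIEvaluator()
    result = evaluator.evaluate_submission(demo_submission, demo_hackathon)
    print(json.dumps(result, indent=2))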