scriptsledge commited on
Commit
3845357
·
verified ·
1 Parent(s): f3e1db8

Upload 4 files

Browse files

feat: optimize inference engine with llama.cpp and Qwen 2.5 7B
- Switched to llama-cpp-python for high-performance CPU inference.
- Upgraded to Qwen 2.5 Coder 7B (GGUF 4-bit) for superior logic.
- Optimized context window (8192 tokens) for 16GB RAM environments.
- Implemented robust heuristic language detection for 20+ languages.
- Security: Added non-root user compliance for HF Spaces.

Files changed (3) hide show
  1. Dockerfile +9 -0
  2. model_service.py +145 -50
  3. requirements.txt +2 -3
Dockerfile CHANGED
@@ -7,6 +7,12 @@ WORKDIR /app
7
  # Copy the requirements file into the container at /app
8
  COPY requirements.txt .
9
 
 
 
 
 
 
 
10
  # Install dependencies
11
  RUN pip install --no-cache-dir -r requirements.txt
12
 
@@ -22,5 +28,8 @@ ENV HOME=/home/user \
22
  # Expose port 7860 (Hugging Face Spaces default)
23
  EXPOSE 7860
24
 
 
 
 
25
  # Run uvicorn when the container launches
26
  CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]
 
7
  # Copy the requirements file into the container at /app
8
  COPY requirements.txt .
9
 
10
+ # Install system dependencies required for building llama-cpp-python
11
+ RUN apt-get update && apt-get install -y \
12
+ build-essential \
13
+ cmake \
14
+ && rm -rf /var/lib/apt/lists/*
15
+
16
  # Install dependencies
17
  RUN pip install --no-cache-dir -r requirements.txt
18
 
 
28
  # Expose port 7860 (Hugging Face Spaces default)
29
  EXPOSE 7860
30
 
31
+ # Set context size for Hugging Face Spaces (Pure 16GB RAM)
32
+ ENV MODEL_CTX_SIZE=8192
33
+
34
  # Run uvicorn when the container launches
35
  CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]
model_service.py CHANGED
@@ -1,102 +1,197 @@
1
  import os
2
- import torch
3
- from transformers import pipeline
4
 
5
  # --- Configuration ---
6
- # Switching to 3B model for faster download and inference as requested
7
- MODEL_ID = "Qwen/Qwen2.5-Coder-3B-Instruct"
 
 
8
 
9
- print(f"Initializing Clarity AI Engine (Transformers Pipeline)...")
10
- print(f"Target Model: {MODEL_ID}")
11
 
12
- # Optimize for speed: use float16 if GPU is available
13
- dtype = torch.float16 if torch.cuda.is_available() else "auto"
14
-
15
- pipe = None
16
 
17
  try:
18
- print("Loading model pipeline...")
19
- # Using the exact pattern you provided
20
- pipe = pipeline(
21
- "text-generation",
22
- model=MODEL_ID,
23
- device_map="auto",
24
- torch_dtype=dtype
 
 
 
 
 
 
 
 
 
25
  )
26
  print("Success: Clarity AI Model loaded.")
27
 
28
  except Exception as e:
29
  print(f"CRITICAL ERROR: Failed to load model. {e}")
30
- pipe = None
31
 
32
  def detect_language(code: str) -> dict:
33
  """
34
- Simple heuristic to detect programming language.
35
  """
36
  code = code.strip()
37
- if "#include" in code or "std::" in code or "int main()" in code:
 
 
38
  return {"name": "C++", "ext": "cpp"}
39
- if "public class" in code or "System.out.println" in code:
40
- return {"name": "Java", "ext": "java"}
41
- if "const " in code or "let " in code or "console.log" in code or "function" in code:
42
- return {"name": "JavaScript", "ext": "js"}
43
- if "def " in code or "import " in code or "print(" in code:
 
 
 
 
 
 
 
44
  return {"name": "Python", "ext": "py"}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
45
  return {"name": "Text", "ext": "txt"}
46
 
47
  def correct_code_with_ai(code: str) -> dict:
48
  """
49
- Takes a buggy code snippet and returns a corrected version using the Qwen model pipeline.
50
  """
51
  detected_lang = detect_language(code)
52
 
53
- if not pipe:
54
  return {
55
  "code": "# Model failed to load. Check server logs.",
56
  "language": detected_lang
57
  }
58
 
 
59
  system_prompt = (
60
- "You are 'Clarity', an intelligent code correction and refactoring engine. "
61
- f"Your goal is to take buggy or suboptimal {detected_lang['name']} code and provide a clean, "
62
- "production-ready version. \n\n"
63
- "Tasks:\n"
64
- "1. Fix all syntax and logical bugs.\n"
65
- "2. Improve code structure and readability (refactoring).\n"
66
- "3. Enforce industry-standard naming conventions.\n"
67
- "4. Maintain the original intent and logic of the code.\n\n"
68
- "Constraint: Return ONLY the corrected code. No explanations, no markdown backticks, no comments unless necessary for clarity."
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
69
  )
70
 
71
  messages = [
72
  {"role": "system", "content": system_prompt},
 
 
73
  {"role": "user", "content": code}
74
  ]
75
 
76
  try:
77
- # Standard pipeline call
78
- outputs = pipe(
79
- messages,
80
- max_new_tokens=1024,
81
- return_full_text=False
82
  )
83
 
84
  # Extract content
85
- generated_msg = outputs[0]["generated_text"]
86
-
87
- if isinstance(generated_msg, list):
88
- response_content = generated_msg[-1]["content"]
89
- else:
90
- response_content = str(generated_msg)
91
 
92
- # Clean up
93
  cleaned_response = response_content.strip()
 
 
94
  if "```" in cleaned_response:
95
  lines = cleaned_response.split("\n")
96
- if lines[0].startswith("```"): lines = lines[1:]
 
 
97
  if lines and lines[-1].strip().startswith("```"): lines = lines[:-1]
 
 
 
98
  cleaned_response = "\n".join(lines).strip()
99
 
 
 
 
100
  return {
101
  "code": cleaned_response,
102
  "language": detected_lang
@@ -107,4 +202,4 @@ def correct_code_with_ai(code: str) -> dict:
107
  return {
108
  "code": f"# An error occurred during processing: {str(e)}",
109
  "language": detected_lang
110
- }
 
1
  import os
2
+ from llama_cpp import Llama
3
+ from huggingface_hub import hf_hub_download
4
 
5
  # --- Configuration ---
6
+ # Using the 4-bit quantized version of Qwen 2.5 Coder 7B
7
+ # This fits comfortably in 16GB RAM (~5-6GB usage) and is much faster on CPU
8
+ REPO_ID = "Qwen/Qwen2.5-Coder-7B-Instruct-GGUF"
9
+ FILENAME = "qwen2.5-coder-7b-instruct-q4_k_m.gguf"
10
 
11
+ print(f"Initializing Clarity AI Engine (llama.cpp)...")
12
+ print(f"Target Model: {REPO_ID} [{FILENAME}]")
13
 
14
+ llm = None
 
 
 
15
 
16
  try:
17
+ print("Downloading/Loading model...")
18
+ model_path = hf_hub_download(
19
+ repo_id=REPO_ID,
20
+ filename=FILENAME,
21
+ # This caches the model in ~/.cache/huggingface/hub
22
+ )
23
+
24
+ # Initialize Llama
25
+ # Use environment variable to toggle context size (8192 for HF Spaces, 4096 for local)
26
+ ctx_size = int(os.getenv("MODEL_CTX_SIZE", "4096"))
27
+ llm = Llama(
28
+ model_path=model_path,
29
+ n_ctx=ctx_size,
30
+ n_batch=512,
31
+ n_threads=os.cpu_count(),
32
+ verbose=False
33
  )
34
  print("Success: Clarity AI Model loaded.")
35
 
36
  except Exception as e:
37
  print(f"CRITICAL ERROR: Failed to load model. {e}")
38
+ llm = None
39
 
40
  def detect_language(code: str) -> dict:
41
  """
42
+ Heuristic detection for LeetCode-supported languages.
43
  """
44
  code = code.strip()
45
+
46
+ # C / C++
47
+ if "#include" in code or "using namespace std" in code or "std::" in code:
48
  return {"name": "C++", "ext": "cpp"}
49
+ if "printf" in code and "#include <stdio.h>" in code:
50
+ return {"name": "C", "ext": "c"}
51
+
52
+ # Java / C#
53
+ if "public class" in code:
54
+ if "System.out.println" in code or "public static void main" in code:
55
+ return {"name": "Java", "ext": "java"}
56
+ if "Console.WriteLine" in code or "namespace " in code or "using System" in code:
57
+ return {"name": "C#", "ext": "cs"}
58
+
59
+ # Python
60
+ if "def " in code and ":" in code:
61
  return {"name": "Python", "ext": "py"}
62
+
63
+ # JS / TS
64
+ if "console.log" in code or "const " in code or "let " in code or "function" in code:
65
+ if ": number" in code or ": string" in code or "interface " in code:
66
+ return {"name": "TypeScript", "ext": "ts"}
67
+ return {"name": "JavaScript", "ext": "js"}
68
+
69
+ # Go
70
+ if "package main" in code or "func main" in code or "fmt.Print" in code:
71
+ return {"name": "Go", "ext": "go"}
72
+
73
+ # Rust
74
+ if "fn " in code and ("let mut" in code or "println!" in code or "Vec<" in code):
75
+ return {"name": "Rust", "ext": "rs"}
76
+
77
+ # PHP
78
+ if "<?php" in code or "$" in code and "echo" in code:
79
+ return {"name": "PHP", "ext": "php"}
80
+
81
+ # Ruby
82
+ if "def " in code and "end" in code and "puts" in code:
83
+ return {"name": "Ruby", "ext": "rb"}
84
+
85
+ # Swift
86
+ if "func " in code and ("var " in code or "let " in code) and "print(" in code:
87
+ if "->" in code: # Swift return type arrow
88
+ return {"name": "Swift", "ext": "swift"}
89
+
90
+ # Kotlin
91
+ if "fun " in code and ("val " in code or "var " in code) and "println(" in code:
92
+ return {"name": "Kotlin", "ext": "kt"}
93
+
94
+ # Dart
95
+ if "void main()" in code and "print(" in code and ";" in code:
96
+ return {"name": "Dart", "ext": "dart"}
97
+
98
+ # Scala
99
+ if "object " in code or "def main" in code or "val " in code and "println" in code:
100
+ return {"name": "Scala", "ext": "scala"}
101
+
102
+ # Elixir
103
+ if "defmodule" in code or "defp" in code or "IO.puts" in code or ":ok" in code:
104
+ return {"name": "Elixir", "ext": "ex"}
105
+
106
+ # Erlang
107
+ if "-module" in code or "-export" in code or "io:format" in code:
108
+ return {"name": "Erlang", "ext": "erl"}
109
+
110
+ # Racket / Lisp
111
+ if "(define" in code or "(lambda" in code or "#lang racket" in code:
112
+ return {"name": "Racket", "ext": "rkt"}
113
+
114
+ # Fallback
115
  return {"name": "Text", "ext": "txt"}
116
 
117
  def correct_code_with_ai(code: str) -> dict:
118
  """
119
+ Takes a buggy code snippet and returns a corrected version using the Qwen model.
120
  """
121
  detected_lang = detect_language(code)
122
 
123
+ if not llm:
124
  return {
125
  "code": "# Model failed to load. Check server logs.",
126
  "language": detected_lang
127
  }
128
 
129
+ # Stricter System Prompt with Educational Persona
130
  system_prompt = (
131
+ "You are Clarity, an intelligent coding assistant designed for students and junior developers. "
132
+ "You were created by a team of college students (see projects.md) for a minor project to help peers write better code.\n\n"
133
+ "Your Mission:\n"
134
+ "1. **Review & Fix:** Correct syntax and logical errors.\n"
135
+ "2. **Educate:** Improve variable naming (use industry standards like Google Style Guide), readability, and structure.\n"
136
+ "3. **Optimize:** Remove redundancy and improve logic.\n"
137
+ "4. **Be Concise:** Provide objective, short, and high-value feedback. Avoid long lectures.\n\n"
138
+ "Guidelines:\n"
139
+ "- **Style:** Follow the Google Style Guide for the respective language.\n"
140
+ "- **Comments:** Add comments ONLY for complex logic or educational 'aha!' moments.\n"
141
+ "- **Tone:** Concise, Objective, and Mentor-like.\n"
142
+ "- **Identity:** You are 'Clarity'. If asked about your version, refer users to the GitHub repo. If asked non-code questions, answer only if factual and harmless; otherwise, politely decline.\n\n"
143
+ "Constraint: Return ONLY the corrected code with necessary educational comments inline. Do not output a separate explanation block unless absolutely necessary for a critical concept."
144
+ )
145
+
146
+ # One-shot example to force the pattern (Input -> Code Only)
147
+ example_input = "def sum(a,b): return a+b" if detected_lang["name"] == "Python" else "int sum(int a, int b) { return a+b; }"
148
+ example_output = (
149
+ "def sum(operand_a, operand_b):\n"
150
+ " # Descriptive names improve readability\n"
151
+ " return operand_a + operand_b"
152
+ ) if detected_lang["name"] == "Python" else (
153
+ "int sum(int operand_a, int operand_b) {\n"
154
+ " // Descriptive names improve readability\n"
155
+ " return operand_a + operand_b;\n"
156
+ "}"
157
  )
158
 
159
  messages = [
160
  {"role": "system", "content": system_prompt},
161
+ {"role": "user", "content": example_input},
162
+ {"role": "assistant", "content": example_output},
163
  {"role": "user", "content": code}
164
  ]
165
 
166
  try:
167
+ # llama-cpp-python chat completion
168
+ response = llm.create_chat_completion(
169
+ messages=messages,
170
+ max_tokens=2048,
171
+ temperature=0.1, # Lower temperature for stricter adherence
172
  )
173
 
174
  # Extract content
175
+ response_content = response["choices"][0]["message"]["content"]
 
 
 
 
 
176
 
177
+ # Clean up (double check for markdown or chatty intros)
178
  cleaned_response = response_content.strip()
179
+
180
+ # Aggressive stripping of "Here is the code..." or markdown
181
  if "```" in cleaned_response:
182
  lines = cleaned_response.split("\n")
183
+ # Remove starting markdown
184
+ if lines[0].strip().startswith("```"): lines = lines[1:]
185
+ # Remove ending markdown
186
  if lines and lines[-1].strip().startswith("```"): lines = lines[:-1]
187
+ # Remove common chatty prefixes if they slipped through
188
+ if lines and (lines[0].lower().startswith("here is") or lines[0].lower().startswith("sure")):
189
+ lines = lines[1:]
190
  cleaned_response = "\n".join(lines).strip()
191
 
192
+ # Run detection on the CLEAN, CORRECTED code for maximum accuracy
193
+ detected_lang = detect_language(cleaned_response)
194
+
195
  return {
196
  "code": cleaned_response,
197
  "language": detected_lang
 
202
  return {
203
  "code": f"# An error occurred during processing: {str(e)}",
204
  "language": detected_lang
205
+ }
requirements.txt CHANGED
@@ -1,5 +1,4 @@
1
  fastapi
2
  uvicorn
3
- transformers
4
- torch
5
- accelerate
 
1
  fastapi
2
  uvicorn
3
+ llama-cpp-python
4
+ huggingface-hub