likhonsheikhdev committed
Commit ab0cf4f · verified · Parent: 09b5534

Upload folder using huggingface_hub

Files changed (4):
  1. Dockerfile +19 -4
  2. README.md +14 -14
  3. main.py +77 -53
  4. requirements.txt +3 -1
Dockerfile CHANGED

@@ -2,14 +2,25 @@ FROM python:3.11-slim
 
 WORKDIR /app
 
+# Set environment variables for CPU optimization
+ENV OMP_NUM_THREADS=2
+ENV MKL_NUM_THREADS=2
+ENV TOKENIZERS_PARALLELISM=true
+ENV TRANSFORMERS_OFFLINE=0
+
 # Install system dependencies
 RUN apt-get update && apt-get install -y --no-install-recommends \
     build-essential \
     && rm -rf /var/lib/apt/lists/*
 
-# Copy requirements and install Python dependencies
+# Install PyTorch CPU version first
+RUN pip install --no-cache-dir torch==2.4.1+cpu --extra-index-url https://download.pytorch.org/whl/cpu
+
+# Copy and install other requirements
 COPY requirements.txt .
-RUN pip install --no-cache-dir -r requirements.txt
+RUN pip install --no-cache-dir fastapi==0.115.0 uvicorn[standard]==0.30.6 \
+    transformers==4.45.0 pydantic==2.9.2 huggingface-hub==0.25.1 \
+    optimum==1.23.0 onnxruntime==1.19.0
 
 # Copy application code
 COPY . .
@@ -18,8 +29,12 @@ COPY . .
 RUN useradd -m -u 1000 user
 USER user
 
+# Pre-download models during build for faster startup
+RUN python -c "from transformers import pipeline; pipeline('text-classification', model='distilbert-base-uncased-finetuned-sst-2-english')" || true
+RUN python -c "from transformers import pipeline; pipeline('text-generation', model='distilgpt2')" || true
+
 # Expose port
 EXPOSE 7860
 
-# Run the application
-CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]
+# Run with optimized settings
+CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860", "--workers", "1"]
README.md CHANGED

@@ -5,12 +5,16 @@ colorFrom: blue
 colorTo: purple
 sdk: docker
 app_port: 7860
+suggested_hardware: cpu-basic
 pinned: false
 ---
 
 # Docker Model Runner
 
-A Docker-based Hugging Face Space with named API endpoints for model inference.
+A CPU-optimized Docker Space with named API endpoints for model inference.
+
+## Hardware
+- **CPU Basic**: 2 vCPU · 16 GB RAM
 
 ## Endpoints
 
@@ -19,27 +23,23 @@ A Docker-based Hugging Face Space with named API endpoints for model inference.
 | `/` | GET | Welcome message |
 | `/health` | GET | Health check |
 | `/info` | GET | Model information |
-| `/predict` | POST | Run model prediction |
+| `/predict` | POST | Text classification |
 | `/generate` | POST | Text generation |
-| `/embed` | POST | Get text embeddings |
+| `/embed` | POST | Text embeddings |
 
 ## Usage
 
-### Health Check
 ```bash
-curl https://YOUR-SPACE.hf.space/health
-```
+# Health Check
+curl https://likhonsheikhdev-docker-model-runner.hf.space/health
 
-### Prediction
-```bash
-curl -X POST https://YOUR-SPACE.hf.space/predict \
+# Prediction
+curl -X POST https://likhonsheikhdev-docker-model-runner.hf.space/predict \
   -H "Content-Type: application/json" \
-  -d '{"text": "Hello, world!"}'
-```
+  -d '{"text": "I love this product!"}'
 
-### Text Generation
-```bash
-curl -X POST https://YOUR-SPACE.hf.space/generate \
+# Text Generation
+curl -X POST https://likhonsheikhdev-docker-model-runner.hf.space/generate \
   -H "Content-Type: application/json" \
   -d '{"prompt": "Once upon a time", "max_length": 50}'
 ```
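
The updated curl examples cover `/health`, `/predict`, and `/generate` but not `/embed`. A minimal Python client sketch for that endpoint, with field names taken from `EmbedRequest`/`EmbedResponse` in main.py (the `requests` package on the client side is an assumption):

```python
# Sketch of a client call to the /embed endpoint, which the curl examples omit.
import requests

resp = requests.post(
    "https://likhonsheikhdev-docker-model-runner.hf.space/embed",
    json={"texts": ["I love this product!", "Once upon a time"]},
    timeout=60,
)
resp.raise_for_status()
body = resp.json()
print(body["model"], body["dimensions"])  # embedding model and vector size (384 for MiniLM-L6-v2)
print(len(body["embeddings"]))            # one vector per input text
```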
main.py CHANGED

@@ -1,5 +1,6 @@
 """
-Docker Model Runner - FastAPI application with named endpoints
+Docker Model Runner - CPU-Optimized FastAPI application
+Optimized for: 2 vCPU, 16GB RAM
 """
 from fastapi import FastAPI, HTTPException
 from pydantic import BaseModel
@@ -8,45 +9,61 @@ import torch
 from transformers import pipeline, AutoTokenizer, AutoModel
 import os
 from datetime import datetime
+from contextlib import asynccontextmanager
 
-app = FastAPI(
-    title="Docker Model Runner",
-    description="HuggingFace Space with named endpoints for model inference",
-    version="1.0.0"
-)
-
-# Model configurations
-MODEL_NAME = os.getenv("MODEL_NAME", "distilbert-base-uncased")
-GENERATOR_MODEL = os.getenv("GENERATOR_MODEL", "gpt2")
-
-# Lazy-loaded pipelines
-_classifier = None
-_generator = None
-_embedder = None
-
-
-def get_classifier():
-    global _classifier
-    if _classifier is None:
-        _classifier = pipeline("text-classification", model=MODEL_NAME)
-    return _classifier
-
-
-def get_generator():
-    global _generator
-    if _generator is None:
-        _generator = pipeline("text-generation", model=GENERATOR_MODEL)
-    return _generator
-
-
-def get_embedder():
-    global _embedder
-    if _embedder is None:
-        _embedder = {
-            "tokenizer": AutoTokenizer.from_pretrained(MODEL_NAME),
-            "model": AutoModel.from_pretrained(MODEL_NAME)
-        }
-    return _embedder
+# CPU-optimized lightweight models
+MODEL_NAME = os.getenv("MODEL_NAME", "distilbert-base-uncased-finetuned-sst-2-english")
+GENERATOR_MODEL = os.getenv("GENERATOR_MODEL", "distilgpt2")
+EMBED_MODEL = os.getenv("EMBED_MODEL", "sentence-transformers/all-MiniLM-L6-v2")
+
+# Set CPU threading
+torch.set_num_threads(2)
+
+# Global model cache
+models = {}
+
+
+def load_models():
+    """Pre-load models for faster inference"""
+    global models
+    print("Loading models for CPU inference...")
+
+    # Use smaller, faster models optimized for CPU
+    models["classifier"] = pipeline(
+        "text-classification",
+        model=MODEL_NAME,
+        device=-1,  # CPU
+        torch_dtype=torch.float32
+    )
+
+    models["generator"] = pipeline(
+        "text-generation",
+        model=GENERATOR_MODEL,
+        device=-1,
+        torch_dtype=torch.float32
+    )
+
+    # Lightweight embedding model
+    models["tokenizer"] = AutoTokenizer.from_pretrained(EMBED_MODEL)
+    models["embedder"] = AutoModel.from_pretrained(EMBED_MODEL)
+    models["embedder"].eval()
+
+    print("✅ All models loaded successfully!")
+
+
+@asynccontextmanager
+async def lifespan(app: FastAPI):
+    load_models()
+    yield
+    models.clear()
+
+
+app = FastAPI(
+    title="Docker Model Runner",
+    description="CPU-Optimized HuggingFace Space with named endpoints",
+    version="1.0.0",
+    lifespan=lifespan
+)
 
 
 # Request/Response Models
@@ -65,7 +82,7 @@ class GenerateRequest(BaseModel):
     prompt: str
     max_length: Optional[int] = 50
     num_return_sequences: Optional[int] = 1
-    temperature: Optional[float] = 1.0
+    temperature: Optional[float] = 0.7
 
 
 class GenerateResponse(BaseModel):
@@ -88,12 +105,14 @@ class EmbedResponse(BaseModel):
 class HealthResponse(BaseModel):
     status: str
     timestamp: str
-    gpu_available: bool
+    hardware: str
+    models_loaded: bool
 
 
 class InfoResponse(BaseModel):
     name: str
     version: str
+    hardware: str
     models: dict
     endpoints: List[str]
 
@@ -103,7 +122,8 @@ class InfoResponse(BaseModel):
 async def root():
     """Welcome endpoint"""
     return {
-        "message": "Docker Model Runner API",
+        "message": "Docker Model Runner API (CPU Optimized)",
+        "hardware": "CPU Basic: 2 vCPU · 16 GB RAM",
         "docs": "/docs",
         "endpoints": ["/health", "/info", "/predict", "/generate", "/embed"]
     }
@@ -115,7 +135,8 @@ async def health():
     return HealthResponse(
         status="healthy",
         timestamp=datetime.utcnow().isoformat(),
-        gpu_available=torch.cuda.is_available()
+        hardware="CPU Basic: 2 vCPU · 16 GB RAM",
+        models_loaded=len(models) > 0
     )
 
 
@@ -125,10 +146,11 @@ async def info():
     return InfoResponse(
         name="Docker Model Runner",
        version="1.0.0",
+        hardware="CPU Basic: 2 vCPU · 16 GB RAM",
         models={
             "classifier": MODEL_NAME,
             "generator": GENERATOR_MODEL,
-            "embedder": MODEL_NAME
+            "embedder": EMBED_MODEL
         },
         endpoints=["/", "/health", "/info", "/predict", "/generate", "/embed"]
     )
@@ -137,15 +159,14 @@ async def info():
 @app.post("/predict", response_model=PredictResponse)
 async def predict(request: PredictRequest):
     """
-    Run text classification prediction
+    Run text classification (sentiment analysis)
 
     - **text**: Input text to classify
     - **top_k**: Number of top predictions to return
     """
     try:
         start_time = datetime.now()
-        classifier = get_classifier()
-        results = classifier(request.text, top_k=request.top_k)
+        results = models["classifier"](request.text, top_k=request.top_k)
         latency = (datetime.now() - start_time).total_seconds() * 1000
 
         return PredictResponse(
@@ -163,19 +184,18 @@ async def generate(request: GenerateRequest):
     Generate text from a prompt
 
     - **prompt**: Input prompt for generation
-    - **max_length**: Maximum length of generated text
-    - **num_return_sequences**: Number of sequences to generate
-    - **temperature**: Sampling temperature
+    - **max_length**: Maximum length of generated text (default: 50)
+    - **temperature**: Sampling temperature (default: 0.7)
     """
     try:
         start_time = datetime.now()
-        generator = get_generator()
-        results = generator(
+        results = models["generator"](
             request.prompt,
             max_length=request.max_length,
             num_return_sequences=request.num_return_sequences,
             temperature=request.temperature,
-            do_sample=True
+            do_sample=True,
+            pad_token_id=50256  # GPT2 pad token
         )
         latency = (datetime.now() - start_time).total_seconds() * 1000
 
@@ -193,32 +213,36 @@ async def generate(request: GenerateRequest):
 @app.post("/embed", response_model=EmbedResponse)
 async def embed(request: EmbedRequest):
     """
-    Get text embeddings
+    Get text embeddings using MiniLM (384 dimensions)
 
     - **texts**: List of texts to embed
     """
     try:
         start_time = datetime.now()
-        embedder = get_embedder()
 
-        # Tokenize and get embeddings
-        inputs = embedder["tokenizer"](
+        # Tokenize
+        inputs = models["tokenizer"](
            request.texts,
             padding=True,
             truncation=True,
+            max_length=256,
             return_tensors="pt"
         )
 
+        # Get embeddings
         with torch.no_grad():
-            outputs = embedder["model"](**inputs)
-            # Use mean pooling
-            embeddings = outputs.last_hidden_state.mean(dim=1)
+            outputs = models["embedder"](**inputs)
+            # Mean pooling
+            attention_mask = inputs["attention_mask"]
+            token_embeddings = outputs.last_hidden_state
+            input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
+            embeddings = torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
 
         latency = (datetime.now() - start_time).total_seconds() * 1000
 
         return EmbedResponse(
             embeddings=embeddings.tolist(),
-            model=MODEL_NAME,
+            model=EMBED_MODEL,
             dimensions=embeddings.shape[1],
             latency_ms=round(latency, 2)
         )
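
The new `/embed` handler replaces the plain `last_hidden_state.mean(dim=1)` with attention-mask-weighted mean pooling, so padding tokens no longer dilute the sentence vector. A toy check of that arithmetic, independent of any model:

```python
# Toy check of the masked mean pooling used in /embed: positions where
# attention_mask == 0 (padding) must not contribute to the pooled vector.
import torch

token_embeddings = torch.tensor([[[1.0, 2.0],
                                  [3.0, 4.0],
                                  [9.0, 9.0]]])  # (batch=1, seq=3, dim=2); last row is padding
attention_mask = torch.tensor([[1, 1, 0]])

mask = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
pooled = torch.sum(token_embeddings * mask, 1) / torch.clamp(mask.sum(1), min=1e-9)
print(pooled)  # tensor([[2., 3.]]): the padded row is ignored; a plain mean(dim=1) would give [[4.33, 5.00]]
```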
requirements.txt CHANGED

@@ -1,6 +1,8 @@
 fastapi==0.115.0
 uvicorn[standard]==0.30.6
 transformers==4.45.0
-torch==2.4.1
+torch==2.4.1+cpu --extra-index-url https://download.pytorch.org/whl/cpu
 pydantic==2.9.2
 huggingface-hub==0.25.1
+optimum==1.23.0
+onnxruntime==1.19.0
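
pip's requirements-file format documents `--extra-index-url` as an option on its own line, so an equivalent layout for these pins would look like the sketch below (the new Dockerfile pins the same packages directly in its `RUN pip install` steps rather than installing from this file):

```text
--extra-index-url https://download.pytorch.org/whl/cpu
fastapi==0.115.0
uvicorn[standard]==0.30.6
transformers==4.45.0
torch==2.4.1+cpu
pydantic==2.9.2
huggingface-hub==0.25.1
optimum==1.23.0
onnxruntime==1.19.0
```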