zhimin-z
committed on
Commit
·
91984f1
1
Parent(s):
9a3c3b5
fix
Browse files
msr.py
CHANGED
|
@@ -54,13 +54,8 @@ PATCH_WANTED_LABELS = [
|
|
| 54 |
# Git sync configuration (mandatory to get latest bot data)
|
| 55 |
GIT_SYNC_TIMEOUT = 300 # 5 minutes timeout for git pull
|
| 56 |
|
| 57 |
-
# OPTIMIZED DUCKDB CONFIGURATION
|
| 58 |
-
DUCKDB_THREADS = 16
|
| 59 |
-
DUCKDB_MEMORY_LIMIT = "64GB"
|
| 60 |
-
|
| 61 |
# Streaming batch configuration
|
| 62 |
-
BATCH_SIZE_DAYS =
|
| 63 |
-
# At this size: ~7 days × 24 files × ~100MB per file = ~16GB uncompressed per batch
|
| 64 |
|
| 65 |
# Download configuration
|
| 66 |
DOWNLOAD_WORKERS = 4
|
|
@@ -320,13 +315,28 @@ def get_duckdb_connection():
|
|
| 320 |
# Re-raise if it's not a locking error
|
| 321 |
raise
|
| 322 |
|
| 323 |
-
#
|
| 324 |
-
conn.execute(f"SET threads TO
|
| 325 |
-
conn.execute("SET
|
| 326 |
-
conn.execute("SET enable_object_cache = true;")
|
| 327 |
conn.execute("SET temp_directory = '/tmp/duckdb_temp';")
|
| 328 |
-
|
| 329 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 330 |
|
| 331 |
return conn
|
| 332 |
|
|
|
|
| 54 |
# Git sync configuration (mandatory to get latest bot data)
|
| 55 |
GIT_SYNC_TIMEOUT = 300 # 5 minutes timeout for git pull
|
| 56 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 57 |
# Streaming batch configuration
|
| 58 |
+
BATCH_SIZE_DAYS = 1 # Process 1 day at a time (~24 hourly files)
|
|
|
|
| 59 |
|
| 60 |
# Download configuration
|
| 61 |
DOWNLOAD_WORKERS = 4
|
|
|
|
| 315 |
# Re-raise if it's not a locking error
|
| 316 |
raise
|
| 317 |
|
| 318 |
+
# CORE MEMORY & THREADING SETTINGS
|
| 319 |
+
conn.execute(f"SET threads TO 8;")
|
| 320 |
+
conn.execute(f"SET max_memory = '48GB';") # Hard cap
|
|
|
|
| 321 |
conn.execute("SET temp_directory = '/tmp/duckdb_temp';")
|
| 322 |
+
|
| 323 |
+
# JSON STREAMING OPTIMIZATIONS (critical for performance)
|
| 324 |
+
conn.execute("SET json.read_objects = true;") # Enable streaming JSON objects
|
| 325 |
+
conn.execute("SET json.read_buffer_size = '64MB';") # Increase from 256KB default for large fields
|
| 326 |
+
conn.execute("SET json.format = 'newline_delimited';") # Skip array parsing, double throughput
|
| 327 |
+
|
| 328 |
+
# GZIP PARALLEL DECOMPRESSION (only needed for .json.gz files)
|
| 329 |
+
try:
|
| 330 |
+
conn.execute("SET extension_directory = '/tmp/duckdb_ext';")
|
| 331 |
+
conn.execute("INSTALL 'gzip';")
|
| 332 |
+
conn.execute("LOAD 'gzip';")
|
| 333 |
+
except Exception as e:
|
| 334 |
+
print(f" ⚠ Warning: Could not load gzip extension: {e}")
|
| 335 |
+
|
| 336 |
+
# PERFORMANCE OPTIMIZATIONS
|
| 337 |
+
conn.execute("SET preserve_insertion_order = false;") # Disable expensive ordering
|
| 338 |
+
conn.execute("SET default_order = 'ORDER BY NONE';") # Skip unnecessary sorting
|
| 339 |
+
conn.execute("SET enable_object_cache = true;") # Cache repeatedly read files
|
| 340 |
|
| 341 |
return conn
|
| 342 |
|