zhimin-z committed on
Commit 91984f1 · 1 Parent(s): 9a3c3b5
Files changed (1)
  1. msr.py +22 -12
msr.py CHANGED
@@ -54,13 +54,8 @@ PATCH_WANTED_LABELS = [
 # Git sync configuration (mandatory to get latest bot data)
 GIT_SYNC_TIMEOUT = 300  # 5 minutes timeout for git pull
 
-# OPTIMIZED DUCKDB CONFIGURATION
-DUCKDB_THREADS = 16
-DUCKDB_MEMORY_LIMIT = "64GB"
-
 # Streaming batch configuration
-BATCH_SIZE_DAYS = 7  # Process 1 week at a time (~168 hourly files)
-# At this size: ~7 days × 24 files × ~100MB per file = ~16GB uncompressed per batch
+BATCH_SIZE_DAYS = 1  # Process 1 day at a time (~24 hourly files)
 
 # Download configuration
 DOWNLOAD_WORKERS = 4
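The hunk above shrinks each streaming pass from a week of hourly dump files (~16GB uncompressed) to a single day (~24 files, roughly 2.4GB at ~100MB per file). A minimal sketch of how day-sized windows could be generated from this constant; the batch_windows helper is an illustrative assumption, not code from msr.py:

from datetime import date, timedelta

BATCH_SIZE_DAYS = 1  # mirrors the constant in msr.py

def batch_windows(start: date, end: date, size_days: int = BATCH_SIZE_DAYS):
    """Yield (batch_start, batch_end) date pairs covering [start, end] inclusive."""
    # Hypothetical helper: msr.py's actual batching loop may differ.
    cur = start
    while cur <= end:
        hi = min(cur + timedelta(days=size_days - 1), end)
        yield cur, hi
        cur = hi + timedelta(days=1)

# With size_days=1 each window maps to ~24 hourly files; at the old
# setting of 7 it was ~168 files (~16GB uncompressed) per batch.
for lo, hi in batch_windows(date(2024, 1, 1), date(2024, 1, 3)):
    print(lo, hi)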
@@ -320,13 +315,28 @@ def get_duckdb_connection():
             # Re-raise if it's not a locking error
             raise
 
-    # OPTIMIZED SETTINGS
-    conn.execute(f"SET threads TO {DUCKDB_THREADS};")
-    conn.execute("SET preserve_insertion_order = false;")
-    conn.execute("SET enable_object_cache = true;")
+    # CORE MEMORY & THREADING SETTINGS
+    conn.execute(f"SET threads TO 8;")
+    conn.execute(f"SET max_memory = '48GB';")  # Hard cap
     conn.execute("SET temp_directory = '/tmp/duckdb_temp';")
-    conn.execute(f"SET memory_limit = '{DUCKDB_MEMORY_LIMIT}';")  # Per-query limit
-    conn.execute(f"SET max_memory = '{DUCKDB_MEMORY_LIMIT}';")  # Hard cap
+
+    # JSON STREAMING OPTIMIZATIONS (critical for performance)
+    conn.execute("SET json.read_objects = true;")  # Enable streaming JSON objects
+    conn.execute("SET json.read_buffer_size = '64MB';")  # Increase from 256KB default for large fields
+    conn.execute("SET json.format = 'newline_delimited';")  # Skip array parsing, double throughput
+
+    # GZIP PARALLEL DECOMPRESSION (only needed for .json.gz files)
+    try:
+        conn.execute("SET extension_directory = '/tmp/duckdb_ext';")
+        conn.execute("INSTALL 'gzip';")
+        conn.execute("LOAD 'gzip';")
+    except Exception as e:
+        print(f"  ⚠ Warning: Could not load gzip extension: {e}")
+
+    # PERFORMANCE OPTIMIZATIONS
+    conn.execute("SET preserve_insertion_order = false;")  # Disable expensive ordering
+    conn.execute("SET default_order = 'ORDER BY NONE';")  # Skip unnecessary sorting
+    conn.execute("SET enable_object_cache = true;")  # Cache repeatedly read files
 
     return conn
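One detail on the removed lines: in DuckDB, memory_limit and max_memory are aliases for the same setting, so the old code's "per-query limit" and "hard cap" comments described a single knob set twice; the commit's lone max_memory call is equivalent. A quick standalone check (not code from msr.py):

import duckdb

conn = duckdb.connect()
conn.execute("SET max_memory = '48GB';")
# memory_limit reads back the same value because it aliases max_memory.
print(conn.execute("SELECT current_setting('memory_limit')").fetchone())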
 
 
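A caveat on the added settings: json.read_objects, json.read_buffer_size, and json.format do not appear to be configuration options in stock DuckDB, so those SET calls would likely raise a catalog error at runtime, and default_order accepts ASC/DESC rather than 'ORDER BY NONE'. There is also no installable 'gzip' extension; .gz inputs are decompressed natively by the readers (the try/except at least degrades gracefully). The behavior the comments describe is normally requested per call via read_json parameters. A sketch under those assumptions, with an illustrative path, for a DuckDB version where read_json auto-detects the schema:

import duckdb

conn = duckdb.connect()
rows = conn.execute("""
    SELECT *
    FROM read_json(
        '/tmp/events/2024-01-01-*.json.gz',  -- illustrative path; .gz handled natively
        format = 'newline_delimited',        -- NDJSON: no array parsing
        maximum_object_size = 67108864,      -- allow records up to 64MB
        compression = 'gzip'
    )
    LIMIT 5
""").fetchall()
print(rows)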