Spaces:

arhanv
/

drum-kit-generator

Sleeping

App Files Files Community

arhanv committed on Mar 15

Commit

338e293

1 Parent(s): ac3dd61

ported fx generation

Browse files

Files changed (5) hide show

.gitignore +2 -0
app.py +11 -1
fx.py +123 -0
inference.py +6 -1
requirements.txt +3 -0

.gitignore CHANGED Viewed

@@ -3,3 +3,5 @@
 /dataset/unzipped
 /dataset/all_sounds
 *.pyc

 /dataset/unzipped
 /dataset/all_sounds
 *.pyc
+/dataset/processed_audio
+*_concat*.wav

app.py CHANGED Viewed

@@ -5,17 +5,27 @@ import soundfile as sf
 import numpy as np
 from inference import generate_drum_kit
 from audio_utils import play_audio
 # Streamlit UI
 st.title("Generate Drum Kits with Text")
 # User Inputs
-prompt = st.text_input("Describe your drum kit (e.g., 'warm vintage')", "8-bit video game")
 kit_size = st.slider("Number of sounds per instrument:", 1, 10, 4)
 # Run the inference
 if st.button("Generate Drum Kit"):
     drum_kit = generate_drum_kit(prompt, kit_size)
     st.session_state["drum_kit"] = drum_kit  # Store results
 # Display results

 import numpy as np
 from inference import generate_drum_kit
 from audio_utils import play_audio
+from fx import get_fx
 # Streamlit UI
 st.title("Generate Drum Kits with Text")
 # User Inputs
+prompt = st.text_input("Describe your drum kit:", "warm vintage acoustic")
 kit_size = st.slider("Number of sounds per instrument:", 1, 10, 4)
+use_fx = st.toggle("Apply audio effects?")
+if use_fx:
+    if st.toggle("Use a different prompt for audio effects?"):
+        fx_prompt = st.text_input("Describe your desired FX tone:", "soft and ethereal telephone")
+    else:
+        fx_prompt = prompt
 # Run the inference
 if st.button("Generate Drum Kit"):
     drum_kit = generate_drum_kit(prompt, kit_size)
+    if use_fx:
+        drum_kit, fx_params = get_fx(drum_kit, fx_prompt)
+        st.write(fx_params)
     st.session_state["drum_kit"] = drum_kit  # Store results
 # Display results

fx.py ADDED Viewed

	@@ -0,0 +1,123 @@

+from inference import get_clap_embeddings_from_audio, get_clap_embeddings_from_text
+from pedalboard import Pedalboard, Reverb, HighpassFilter, LowpassFilter, Distortion, Bitcrush
+from sklearn.metrics.pairwise import cosine_similarity
+import soundfile as sf
+from skopt import gp_minimize
+from skopt.space import Real
+import librosa
+import numpy as np
+import os
+def concatenate_sounds(drum_kit, output_path="temp_concat.wav"):
+    """Stitch together all drum sounds into one audio file."""
+    all_audio = []
+    sr = 48000
+    for instrument, samples in drum_kit.items():
+        for sample in samples:
+            audio, _ = librosa.load(sample, sr=48000)
+            all_audio.append(audio)
+    # Concatenate all sounds with a small silence gap
+    gap = np.zeros(int(sr * 0.2))  # 200ms silence between sounds
+    full_audio = np.concatenate([item for audio in all_audio for item in (audio, gap)])
+    # Save to temp file
+    sf.write(output_path, full_audio, sr)
+    return output_path
+def evaluate_fitness(audio_path, text_embed):
+    """Compute similarity between processed audio and text query."""
+    audio_embed = get_clap_embeddings_from_audio(audio_path)
+    return cosine_similarity([text_embed], [audio_embed])[0][0]
+def apply_fx(audio_path, params, write_wav=True, output_dir="processed_audio"):
+    """Apply EQ and Reverb to an audio file and return the modified file path."""
+    audio, sr = librosa.load(audio_path, sr=48000)
+    board = Pedalboard([
+        LowpassFilter(cutoff_frequency_hz=params['lowpass']),
+        HighpassFilter(cutoff_frequency_hz=params['highpass']),
+        Distortion(drive_db=params['drive_db']),
+        Bitcrush(bit_depth=params['bit_depth']),
+        Reverb(room_size=params['reverb_size'], wet_level=params['reverb_wet'])
+    ])
+    processed_audio = board(audio, sr)
+    if write_wav:
+        # Determine output directory dynamically
+        base_dir = os.path.dirname(os.path.dirname(audio_path))  # Get 'dataset' level
+        output_dir = os.path.join(base_dir, output_dir)
+        # Ensure the output directory exists
+        os.makedirs(output_dir, exist_ok=True)
+        # Create new file path inside the processed_sounds directory
+        file_name = os.path.basename(audio_path).replace(".wav", "_processed.wav")
+        output_path = os.path.join(output_dir, file_name)
+        # Save processed audio
+        sf.write(output_path, processed_audio, sr)
+        return output_path
+    else:
+        return processed_audio
+def objective_function(params, audio_file, text_embedding):
+    """Objective function for Bayesian Optimization using the concatenated file."""
+    processed_audio = apply_fx(audio_file, {
+        "lowpass": params[0],
+        "highpass": params[1],
+        "reverb_size": params[2],
+        "reverb_wet": params[3],
+        "drive_db": params[4],
+        "bit_depth": params[5]
+    }, write_wav=True)
+    similarity = evaluate_fitness(processed_audio, text_embedding)
+    return -similarity  # Minimize negative similarity (maximize similarity)
+def get_params_dict(params_list):
+    return {
+        "lowpass cutoff (Hz)": params_list[0],
+        "highpass cutoff (Hz)": params_list[1],
+        "reverb size": params_list[2],
+        "reverb mix": params_list[3],
+        "distortion - gain_db": params_list[4],
+        "bitcrush - bit depth": params_list[5]
+    }
+# Define parameter search space
+search_space = [
+    Real(5000, 15000, name="lowpass"),
+    Real(50, 1000, name="highpass"),
+    Real(0.0, 0.8, name="reverb_size"),
+    Real(0.0, 0.8, name="reverb_wet"),
+    Real(0.0, 20.0, name="drive_db"),
+    Real(6.0, 32.0, name="bit_depth")
+]
+##### Main function #####
+def get_fx(drum_kit, fx_prompt):
+    """Optimize FX settings for the entire drum kit by using a concatenated audio file."""
+    text_embedding = get_clap_embeddings_from_text(fx_prompt)
+    # Concatenate all drum sounds
+    concat_file = concatenate_sounds(drum_kit)
+    # Define the objective function for the concatenated file
+    def obj_func(params):
+        return objective_function(params, concat_file, text_embedding)
+    # Run Bayesian optimization
+    res = gp_minimize(obj_func, search_space, n_calls=30, random_state=42)
+    best_params = res.x
+    # Apply the best FX parameters to each individual sound
+    optimized_kit = {}
+    for instrument, samples in drum_kit.items():
+        optimized_kit[instrument] = [apply_fx(sample, {
+            "lowpass": best_params[0],
+            "highpass": best_params[1],
+            "reverb_size": best_params[2],
+            "reverb_wet": best_params[3],
+            "drive_db": best_params[4],
+            "bit_depth": best_params[5]
+        }, write_wav=True) for sample in samples]
+    return optimized_kit, get_params_dict(best_params)

inference.py CHANGED Viewed

@@ -8,7 +8,6 @@ import zipfile
 import json
 from transformers import ClapModel, ClapProcessor
 import torch
-import shutil
 dataset_zip = "dataset/all_sounds.zip"
 extracted_folder = "dataset/all_sounds"
@@ -65,6 +64,12 @@ def get_clap_embeddings_from_text(text):
         text_embeddings = model.get_text_features(**inputs)
     return text_embeddings.squeeze(0).numpy()
 def find_top_sounds(text_embed, instrument, top_N=4):
     """Finds the closest N sounds for an instrument."""
     valid_sounds = metadata[metadata["Instrument"] == instrument].index.tolist()

 import json
 from transformers import ClapModel, ClapProcessor
 import torch
 dataset_zip = "dataset/all_sounds.zip"
 extracted_folder = "dataset/all_sounds"
         text_embeddings = model.get_text_features(**inputs)
     return text_embeddings.squeeze(0).numpy()
+def get_clap_embeddings_from_audio(audio_path):
+    audio, sr = librosa.load(audio_path)
+    inputs = processor(audios=[audio], return_tensors="pt", sampling_rate=48000)
+    with torch.no_grad():
+        return model.get_audio_features(**inputs).squeeze(0).numpy()
 def find_top_sounds(text_embed, instrument, top_N=4):
     """Finds the closest N sounds for an instrument."""
     valid_sounds = metadata[metadata["Instrument"] == instrument].index.tolist()

requirements.txt CHANGED Viewed

@@ -30,10 +30,12 @@ numba==0.61.0
 numpy==2.1.3
 packaging==24.2
 pandas==2.2.3
 pillow==11.1.0
 platformdirs==4.3.6
 pooch==1.8.2
 protobuf==5.29.3
 pyarrow==19.0.1
 pycparser==2.22
 pydeck==0.9.1
@@ -46,6 +48,7 @@ requests==2.32.3
 rpds-py==0.23.1
 safetensors==0.5.3
 scikit-learn==1.6.1
 scipy==1.15.2
 six==1.17.0
 smmap==5.0.2

 numpy==2.1.3
 packaging==24.2
 pandas==2.2.3
+pedalboard==0.9.16
 pillow==11.1.0
 platformdirs==4.3.6
 pooch==1.8.2
 protobuf==5.29.3
+pyaml==25.1.0
 pyarrow==19.0.1
 pycparser==2.22
 pydeck==0.9.1
 rpds-py==0.23.1
 safetensors==0.5.3
 scikit-learn==1.6.1
+scikit-optimize==0.10.2
 scipy==1.15.2
 six==1.17.0
 smmap==5.0.2