""" ACE-Step 1.5 Custom Edition - Main Application A comprehensive music generation system with three main interfaces: 1. Standard ACE-Step GUI 2. Custom Timeline-based Workflow 3. LoRA Training Studio """ import gradio as gr import torch import numpy as np from pathlib import Path import json from typing import Optional, List, Tuple import spaces from src.ace_step_engine import ACEStepEngine from src.timeline_manager import TimelineManager from src.lora_trainer import LoRATrainer from src.audio_processor import AudioProcessor from src.utils import setup_logging, load_config # Setup logger = setup_logging() config = load_config() # Lazy initialize components (will be initialized on first use) ace_engine = None timeline_manager = None lora_trainer = None audio_processor = None def get_ace_engine(): """Lazy-load ACE-Step engine.""" global ace_engine if ace_engine is None: ace_engine = ACEStepEngine(config) return ace_engine def get_timeline_manager(): """Lazy-load timeline manager.""" global timeline_manager if timeline_manager is None: timeline_manager = TimelineManager(config) return timeline_manager def get_lora_trainer(): """Lazy-load LoRA trainer.""" global lora_trainer if lora_trainer is None: lora_trainer = LoRATrainer(config) return lora_trainer def get_audio_processor(): """Lazy-load audio processor.""" global audio_processor if audio_processor is None: audio_processor = AudioProcessor(config) return audio_processor # ==================== TAB 1: STANDARD ACE-STEP GUI ==================== @spaces.GPU(duration=300) def standard_generate( prompt: str, lyrics: str, duration: int, temperature: float, top_p: float, seed: int, style: str, use_lora: bool, lora_path: Optional[str] = None ) -> Tuple[str, str]: """Standard ACE-Step generation with all original features.""" try: logger.info(f"Standard generation: {prompt[:50]}...") # Get engine instance engine = get_ace_engine() # Generate audio audio_path = engine.generate( prompt=prompt, lyrics=lyrics, duration=duration, temperature=temperature, top_p=top_p, seed=seed, style=style, lora_path=lora_path if use_lora else None ) info = f"✅ Generated {duration}s audio successfully" return audio_path, info except Exception as e: logger.error(f"Standard generation failed: {e}") return None, f"❌ Error: {str(e)}" @spaces.GPU(duration=180) def standard_variation(audio_path: str, variation_strength: float) -> Tuple[str, str]: """Generate variation of existing audio.""" try: result = get_ace_engine().generate_variation(audio_path, variation_strength) return result, "✅ Variation generated" except Exception as e: return None, f"❌ Error: {str(e)}" @spaces.GPU(duration=180) def standard_repaint( audio_path: str, start_time: float, end_time: float, new_prompt: str ) -> Tuple[str, str]: """Repaint specific section of audio.""" try: result = get_ace_engine().repaint(audio_path, start_time, end_time, new_prompt) return result, f"✅ Repainted {start_time}s-{end_time}s" except Exception as e: return None, f"❌ Error: {str(e)}" @spaces.GPU(duration=180) def standard_lyric_edit( audio_path: str, new_lyrics: str ) -> Tuple[str, str]: """Edit lyrics while maintaining music.""" try: result = get_ace_engine().edit_lyrics(audio_path, new_lyrics) return result, "✅ Lyrics edited" except Exception as e: return None, f"❌ Error: {str(e)}" # ==================== TAB 2: CUSTOM TIMELINE WORKFLOW ==================== @spaces.GPU(duration=300) def timeline_generate( prompt: str, lyrics: str, context_length: int, style: str, temperature: float, seed: int, session_state: dict ) -> 
# ==================== TAB 1: STANDARD ACE-STEP GUI ====================

@spaces.GPU(duration=300)
def standard_generate(
    prompt: str,
    lyrics: str,
    duration: int,
    temperature: float,
    top_p: float,
    seed: int,
    style: str,
    use_lora: bool,
    lora_path: Optional[str] = None
) -> Tuple[Optional[str], str]:
    """Standard ACE-Step generation with all original features."""
    try:
        logger.info(f"Standard generation: {prompt[:50]}...")

        # Get engine instance
        engine = get_ace_engine()

        # Generate audio
        audio_path = engine.generate(
            prompt=prompt,
            lyrics=lyrics,
            duration=duration,
            temperature=temperature,
            top_p=top_p,
            seed=seed,
            style=style,
            lora_path=lora_path if use_lora else None
        )

        info = f"✅ Generated {duration}s audio successfully"
        return audio_path, info

    except Exception as e:
        logger.error(f"Standard generation failed: {e}")
        return None, f"❌ Error: {str(e)}"


@spaces.GPU(duration=180)
def standard_variation(audio_path: str, variation_strength: float) -> Tuple[Optional[str], str]:
    """Generate a variation of existing audio."""
    try:
        result = get_ace_engine().generate_variation(audio_path, variation_strength)
        return result, "✅ Variation generated"
    except Exception as e:
        return None, f"❌ Error: {str(e)}"


@spaces.GPU(duration=180)
def standard_repaint(
    audio_path: str,
    start_time: float,
    end_time: float,
    new_prompt: str
) -> Tuple[Optional[str], str]:
    """Repaint a specific section of audio."""
    try:
        result = get_ace_engine().repaint(audio_path, start_time, end_time, new_prompt)
        return result, f"✅ Repainted {start_time}s-{end_time}s"
    except Exception as e:
        return None, f"❌ Error: {str(e)}"


@spaces.GPU(duration=180)
def standard_lyric_edit(
    audio_path: str,
    new_lyrics: str
) -> Tuple[Optional[str], str]:
    """Edit lyrics while keeping the underlying music."""
    try:
        result = get_ace_engine().edit_lyrics(audio_path, new_lyrics)
        return result, "✅ Lyrics edited"
    except Exception as e:
        return None, f"❌ Error: {str(e)}"


# ==================== TAB 2: CUSTOM TIMELINE WORKFLOW ====================

@spaces.GPU(duration=300)
def timeline_generate(
    prompt: str,
    lyrics: str,
    context_length: int,
    style: str,
    temperature: float,
    seed: int,
    session_state: dict
) -> Tuple[Optional[str], Optional[str], Optional[str], dict, str]:
    """
    Generate a 32-second clip with a 2s lead-in, 28s main section, and 2s lead-out.
    Blends with previous clips based on context_length.
    """
    try:
        # Initialize session state if None
        if session_state is None:
            session_state = {"timeline_id": None, "total_clips": 0}

        logger.info(f"Timeline generation with {context_length}s context")

        # Get managers
        tm = get_timeline_manager()
        engine = get_ace_engine()
        ap = get_audio_processor()

        # Get context from the timeline
        context_audio = tm.get_context(
            session_state.get("timeline_id"),
            context_length
        )

        # Generate the 32s clip
        clip = engine.generate_clip(
            prompt=prompt,
            lyrics=lyrics,
            duration=32,
            context_audio=context_audio,
            style=style,
            temperature=temperature,
            seed=seed
        )

        # Blend with the timeline (2s lead-in and lead-out)
        blended_clip = ap.blend_clip(
            clip,
            tm.get_last_clip(session_state.get("timeline_id")),
            lead_in=2.0,
            lead_out=2.0
        )

        # Add to the timeline
        timeline_id = tm.add_clip(
            session_state.get("timeline_id"),
            blended_clip,
            metadata={
                "prompt": prompt,
                "lyrics": lyrics,
                "context_length": context_length
            }
        )

        # Update the session
        session_state["timeline_id"] = timeline_id
        session_state["total_clips"] = session_state.get("total_clips", 0) + 1

        # Get the full timeline audio and its visualization
        full_audio = tm.export_timeline(timeline_id)
        timeline_viz = tm.visualize_timeline(timeline_id)

        info = f"✅ Clip {session_state['total_clips']} added • Total: {tm.get_duration(timeline_id):.1f}s"
        return blended_clip, full_audio, timeline_viz, session_state, info

    except Exception as e:
        logger.error(f"Timeline generation failed: {e}")
        return None, None, None, session_state, f"❌ Error: {str(e)}"


def timeline_extend(
    prompt: str,
    lyrics: str,
    context_length: int,
    session_state: dict
) -> Tuple[Optional[str], Optional[str], Optional[str], dict, str]:
    """Extend the current timeline with a new generation using default settings."""
    return timeline_generate(
        prompt, lyrics, context_length, "auto", 0.7, -1, session_state
    )


@spaces.GPU(duration=240)
def timeline_inpaint(
    start_time: float,
    end_time: float,
    new_prompt: str,
    session_state: dict
) -> Tuple[Optional[str], Optional[str], dict, str]:
    """Inpaint a specific region in the timeline."""
    try:
        # Initialize session state if None
        if session_state is None:
            session_state = {"timeline_id": None, "total_clips": 0}

        tm = get_timeline_manager()
        timeline_id = session_state.get("timeline_id")

        tm.inpaint_region(
            timeline_id,
            start_time,
            end_time,
            new_prompt
        )

        full_audio = tm.export_timeline(timeline_id)
        timeline_viz = tm.visualize_timeline(timeline_id)

        info = f"✅ Inpainted {start_time:.1f}s-{end_time:.1f}s"
        return full_audio, timeline_viz, session_state, info

    except Exception as e:
        return None, None, session_state, f"❌ Error: {str(e)}"


def timeline_reset(session_state: dict) -> Tuple[None, None, str, dict]:
    """Reset the timeline to start fresh."""
    # Initialize session state if None
    if session_state is None:
        session_state = {"timeline_id": None, "total_clips": 0}
    elif session_state.get("timeline_id"):
        get_timeline_manager().delete_timeline(session_state["timeline_id"])
        session_state = {"timeline_id": None, "total_clips": 0}
    return None, None, "Timeline cleared", session_state
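
# For orientation: `AudioProcessor.blend_clip` (src/audio_processor.py), used
# by timeline_generate above, is expected to crossfade the new clip's 2s
# lead-in against the previous clip's 2s lead-out. A minimal sketch of such an
# equal-power crossfade, assuming both clips are float sample arrays at a
# shared sample rate (illustrative only; the real implementation may differ):
#
#     overlap = int(2.0 * sample_rate)
#     t = np.linspace(0.0, np.pi / 2, overlap)
#     fade_out, fade_in = np.cos(t), np.sin(t)
#     mixed = prev[-overlap:] * fade_out + new[:overlap] * fade_in
#     result = np.concatenate([prev[:-overlap], mixed, new[overlap:]])
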
# ==================== TAB 3: LORA TRAINING ====================

def lora_upload_files(files: List[str]) -> str:
    """Upload and prepare audio files for LoRA training."""
    try:
        prepared_files = get_lora_trainer().prepare_dataset(files)
        return f"✅ Prepared {len(prepared_files)} files for training"
    except Exception as e:
        return f"❌ Error: {str(e)}"


@spaces.GPU(duration=300)
def lora_train(
    dataset_path: str,
    model_name: str,
    learning_rate: float,
    batch_size: int,
    num_epochs: int,
    rank: int,
    alpha: int,
    use_existing_lora: bool,
    existing_lora_path: Optional[str] = None,
    progress=gr.Progress()
) -> Tuple[Optional[str], str]:
    """Train a LoRA model on the uploaded dataset."""
    try:
        logger.info(f"Starting LoRA training: {model_name}")

        # Use the lazy getter; the bare `lora_trainer` global may still be None here
        trainer = get_lora_trainer()

        # Initialize or load the LoRA
        if use_existing_lora and existing_lora_path:
            trainer.load_lora(existing_lora_path)
        else:
            trainer.initialize_lora(rank=rank, alpha=alpha)

        # Train, reporting progress back to the UI
        def progress_callback(step, total_steps, loss):
            progress((step, total_steps), desc=f"Training (loss: {loss:.4f})")

        result_path = trainer.train(
            dataset_path=dataset_path,
            model_name=model_name,
            learning_rate=learning_rate,
            batch_size=batch_size,
            num_epochs=num_epochs,
            progress_callback=progress_callback
        )

        info = f"✅ Training complete! Model saved to {result_path}"
        return result_path, info

    except Exception as e:
        logger.error(f"LoRA training failed: {e}")
        return None, f"❌ Error: {str(e)}"


def lora_download(lora_path: str) -> Optional[str]:
    """Provide the LoRA model for download."""
    return lora_path if Path(lora_path).exists() else None
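
# Background on the rank/alpha knobs exposed in the training tab (standard
# LoRA semantics; the exact scaling used by LoRATrainer lives in
# src/lora_trainer.py): a frozen weight W is augmented with a trainable
# low-rank update,
#
#     W_effective = W + (alpha / rank) * (B @ A)
#
# where A is (rank x in_features), B is (out_features x rank), and only A and
# B receive gradients. Higher rank adds capacity (and VRAM/time); alpha scales
# the strength of the update relative to rank.
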
# ==================== GRADIO UI ====================

def create_ui():
    """Create the three-tab Gradio interface."""
    with gr.Blocks(title="ACE-Step 1.5 Custom Edition", theme=gr.themes.Soft()) as app:
        gr.Markdown("""
        # 🎵 ACE-Step 1.5 Custom Edition

        **Three powerful interfaces for music generation and training**

        Models will download automatically on first use (~7GB from HuggingFace)
        """)

        with gr.Tabs():
            # ============ TAB 1: STANDARD ACE-STEP ============
            with gr.Tab("🎼 Standard ACE-Step"):
                gr.Markdown("### Full-featured standard ACE-Step 1.5 interface")

                with gr.Row():
                    with gr.Column():
                        std_prompt = gr.Textbox(
                            label="Prompt",
                            placeholder="Describe the music style, mood, instruments...",
                            lines=3
                        )
                        std_lyrics = gr.Textbox(
                            label="Lyrics (optional)",
                            placeholder="Enter lyrics here...",
                            lines=5
                        )

                        with gr.Row():
                            std_duration = gr.Slider(
                                minimum=10, maximum=240, value=30, step=10,
                                label="Duration (seconds)"
                            )
                            std_style = gr.Dropdown(
                                choices=["auto", "pop", "rock", "jazz", "classical",
                                         "electronic", "hip-hop"],
                                value="auto",
                                label="Style"
                            )

                        with gr.Row():
                            std_temperature = gr.Slider(
                                minimum=0.1, maximum=1.5, value=0.7, step=0.1,
                                label="Temperature"
                            )
                            std_top_p = gr.Slider(
                                minimum=0.1, maximum=1.0, value=0.9, step=0.05,
                                label="Top P"
                            )

                        std_seed = gr.Number(label="Seed (-1 for random)", value=-1)

                        with gr.Row():
                            std_use_lora = gr.Checkbox(label="Use LoRA", value=False)
                            std_lora_path = gr.Textbox(
                                label="LoRA Path",
                                placeholder="Path to LoRA model (if using)"
                            )

                        std_generate_btn = gr.Button("🎵 Generate", variant="primary", size="lg")

                    with gr.Column():
                        gr.Markdown("### Audio Input (Optional)")
                        gr.Markdown("*Upload audio file or record to use as style guidance*")
                        # Not yet wired into the generation handler below
                        std_audio_input = gr.Audio(
                            label="Style Reference Audio",
                            type="filepath"
                        )

                        gr.Markdown("### Generated Output")
                        # type="filepath" so this component can feed the
                        # path-based variation/repaint/edit handlers as input
                        std_audio_out = gr.Audio(label="Generated Audio", type="filepath")
                        std_info = gr.Textbox(label="Status", lines=2)

                        gr.Markdown("### Advanced Controls")
                        with gr.Accordion("🔄 Generate Variation", open=False):
                            std_var_strength = gr.Slider(0.1, 1.0, 0.5, label="Variation Strength")
                            std_var_btn = gr.Button("Generate Variation")

                        with gr.Accordion("🎨 Repaint Section", open=False):
                            std_repaint_start = gr.Number(label="Start Time (s)", value=0)
                            std_repaint_end = gr.Number(label="End Time (s)", value=10)
                            std_repaint_prompt = gr.Textbox(label="New Prompt", lines=2)
                            std_repaint_btn = gr.Button("Repaint")

                        with gr.Accordion("✏️ Edit Lyrics", open=False):
                            std_edit_lyrics = gr.Textbox(label="New Lyrics", lines=4)
                            std_edit_btn = gr.Button("Edit Lyrics")

                # Event handlers
                std_generate_btn.click(
                    fn=standard_generate,
                    inputs=[std_prompt, std_lyrics, std_duration, std_temperature,
                            std_top_p, std_seed, std_style, std_use_lora, std_lora_path],
                    outputs=[std_audio_out, std_info]
                )
                std_var_btn.click(
                    fn=standard_variation,
                    inputs=[std_audio_out, std_var_strength],
                    outputs=[std_audio_out, std_info]
                )
                std_repaint_btn.click(
                    fn=standard_repaint,
                    inputs=[std_audio_out, std_repaint_start, std_repaint_end, std_repaint_prompt],
                    outputs=[std_audio_out, std_info]
                )
                std_edit_btn.click(
                    fn=standard_lyric_edit,
                    inputs=[std_audio_out, std_edit_lyrics],
                    outputs=[std_audio_out, std_info]
                )

            # ============ TAB 2: CUSTOM TIMELINE ============
            with gr.Tab("⏱️ Timeline Workflow"):
                gr.Markdown("""
                ### Custom Timeline-based Generation
                Generate 32-second clips that seamlessly blend together on a master timeline.
                """)

                # Session state for the timeline
                timeline_state = gr.State(value=None)

                with gr.Row():
                    with gr.Column():
                        tl_prompt = gr.Textbox(
                            label="Prompt",
                            placeholder="Describe this section...",
                            lines=3
                        )
                        tl_lyrics = gr.Textbox(
                            label="Lyrics for this clip",
                            placeholder="Enter lyrics for this 32s section...",
                            lines=4
                        )

                        gr.Markdown("*How far back to reference for style guidance*")
                        tl_context_length = gr.Slider(
                            minimum=0, maximum=120, value=30, step=10,
                            label="Context Length (seconds)"
                        )

                        with gr.Row():
                            tl_style = gr.Dropdown(
                                choices=["auto", "pop", "rock", "jazz", "electronic"],
                                value="auto",
                                label="Style"
                            )
                            tl_temperature = gr.Slider(
                                minimum=0.5, maximum=1.0, value=0.7, step=0.05,
                                label="Temperature"
                            )

                        tl_seed = gr.Number(label="Seed (-1 for random)", value=-1)

                        with gr.Row():
                            tl_generate_btn = gr.Button("🎵 Generate Clip", variant="primary", size="lg")
                            tl_extend_btn = gr.Button("➕ Extend", size="lg")

                        tl_reset_btn = gr.Button("🔄 Reset Timeline", variant="secondary")
                        tl_info = gr.Textbox(label="Status", lines=2)

                    with gr.Column():
                        tl_clip_audio = gr.Audio(label="Latest Clip")
                        tl_full_audio = gr.Audio(label="Full Timeline")
                        tl_timeline_viz = gr.Image(label="Timeline Visualization")

                        with gr.Accordion("🎨 Inpaint Timeline Region", open=False):
                            tl_inpaint_start = gr.Number(label="Start Time (s)", value=0)
                            tl_inpaint_end = gr.Number(label="End Time (s)", value=10)
                            tl_inpaint_prompt = gr.Textbox(label="New Prompt", lines=2)
                            tl_inpaint_btn = gr.Button("Inpaint Region")

                # Event handlers
                tl_generate_btn.click(
                    fn=timeline_generate,
                    inputs=[tl_prompt, tl_lyrics, tl_context_length, tl_style,
                            tl_temperature, tl_seed, timeline_state],
                    outputs=[tl_clip_audio, tl_full_audio, tl_timeline_viz,
                             timeline_state, tl_info]
                )
                tl_extend_btn.click(
                    fn=timeline_extend,
                    inputs=[tl_prompt, tl_lyrics, tl_context_length, timeline_state],
                    outputs=[tl_clip_audio, tl_full_audio, tl_timeline_viz,
                             timeline_state, tl_info]
                )
                tl_reset_btn.click(
                    fn=timeline_reset,
                    inputs=[timeline_state],
                    outputs=[tl_clip_audio, tl_full_audio, tl_info, timeline_state]
                )
                tl_inpaint_btn.click(
                    fn=timeline_inpaint,
                    inputs=[tl_inpaint_start, tl_inpaint_end, tl_inpaint_prompt, timeline_state],
                    outputs=[tl_full_audio, tl_timeline_viz, timeline_state, tl_info]
                )
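
            # Note: `timeline_state` is a gr.State, which Gradio scopes per
            # browser session, so concurrent users each build an independent
            # timeline rather than sharing one.
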
Upload Training Data") lora_files = gr.File( label="Audio Files", file_count="multiple", file_types=["audio"] ) lora_upload_btn = gr.Button("📤 Upload & Prepare Dataset") lora_upload_status = gr.Textbox(label="Upload Status", lines=2) gr.Markdown("#### 2. Training Configuration") lora_dataset_path = gr.Textbox( label="Dataset Path", placeholder="Path to prepared dataset" ) lora_model_name = gr.Textbox( label="Model Name", placeholder="my_custom_lora" ) with gr.Row(): lora_learning_rate = gr.Number( label="Learning Rate", value=1e-4 ) lora_batch_size = gr.Slider( minimum=1, maximum=16, value=4, step=1, label="Batch Size" ) with gr.Row(): lora_num_epochs = gr.Slider( minimum=1, maximum=100, value=10, step=1, label="Epochs" ) lora_rank = gr.Slider( minimum=4, maximum=128, value=16, step=4, label="LoRA Rank" ) lora_alpha = gr.Slider( minimum=4, maximum=128, value=32, step=4, label="LoRA Alpha" ) lora_use_existing = gr.Checkbox( label="Continue training from existing LoRA", value=False ) lora_existing_path = gr.Textbox( label="Existing LoRA Path", placeholder="Path to existing LoRA model" ) lora_train_btn = gr.Button("🚀 Start Training", variant="primary", size="lg") with gr.Column(): lora_train_status = gr.Textbox(label="Training Status", lines=3) lora_model_path = gr.Textbox(label="Trained Model Path", lines=1) lora_download_btn = gr.Button("💾 Download Model") lora_download_file = gr.File(label="Download") gr.Markdown(""" #### Training Tips - Upload 10+ audio samples for best results - Keep samples consistent in style/quality - Higher rank = more capacity but slower training - Start with 10-20 epochs and adjust - Use existing LoRA to continue training """) # Event handlers lora_upload_btn.click( fn=lora_upload_files, inputs=[lora_files], outputs=[lora_upload_status] ) lora_train_btn.click( fn=lora_train, inputs=[lora_dataset_path, lora_model_name, lora_learning_rate, lora_batch_size, lora_num_epochs, lora_rank, lora_alpha, lora_use_existing, lora_existing_path], outputs=[lora_model_path, lora_train_status] ) lora_download_btn.click( fn=lora_download, inputs=[lora_model_path], outputs=[lora_download_file] ) gr.Markdown(""" --- ### About ACE-Step 1.5 Custom Edition by Gamahea | Based on [ACE-Step](https://ace-step.github.io/) """) return app # ==================== MAIN ==================== if __name__ == "__main__": logger.info("Starting ACE-Step 1.5 Custom Edition...") try: # Create and launch app app = create_ui() # Monkey patch the get_api_info method to prevent JSON schema errors original_get_api_info = app.get_api_info def safe_get_api_info(*args, **kwargs): """Patched get_api_info that returns minimal info to avoid schema errors""" try: return original_get_api_info(*args, **kwargs) except (TypeError, AttributeError, KeyError) as e: logger.warning(f"API info generation failed, returning minimal info: {e}") return { "named_endpoints": {}, "unnamed_endpoints": {} } app.get_api_info = safe_get_api_info logger.info("✓ Patched get_api_info method") # Launch the app app.launch( server_name="0.0.0.0", server_port=7860, share=False, show_error=True ) except Exception as e: logger.error(f"Failed to launch app: {e}") import traceback traceback.print_exc() raise