"""Text-to-Speech engine with timing information."""
import hashlib
import shutil
import tempfile
from pathlib import Path
from gtts import gTTS
from pydub import AudioSegment
from .config import TTS_LANGUAGE, TTS_SPEED_SLOW, OUTPUT_DIR


def generate_speech(text: str, output_path: str | Path | None = None, language: str | None = None) -> dict:
    """
    Generate speech audio from text using Google TTS.

    Args:
        text: Text to convert to speech
        output_path: Path to save the audio file (optional)
        language: Language code (default: from config)

    Returns:
        dict with audio info including duration and path, or success=False
        plus an error message if TTS generation fails
    """
    language = language or TTS_LANGUAGE

    # Generate TTS and save to a temp file first. gTTS raises on failure
    # (e.g. network errors), so catch and report via the success flag that
    # callers check. The temp file handle is closed before gTTS writes to
    # the path, which also works on Windows.
    try:
        tts = gTTS(text=text, lang=language, slow=TTS_SPEED_SLOW)
        with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as tmp:
            tmp_path = tmp.name
        tts.save(tmp_path)
    except Exception as exc:
        return {"success": False, "error": str(exc), "text": text, "language": language}

    # Load with pydub to get duration
    audio = AudioSegment.from_mp3(tmp_path)
    duration_ms = len(audio)

    # Determine output path; hash() is salted per process, so use a stable
    # digest to keep generated filenames reproducible across runs
    if output_path is None:
        text_hash = hashlib.md5(text.encode("utf-8")).hexdigest()[:8]
        output_path = OUTPUT_DIR / f"speech_{text_hash}.mp3"
    else:
        output_path = Path(output_path)

    output_path.parent.mkdir(parents=True, exist_ok=True)

    # Move temp file to output (shutil.move, unlike os.rename, works across
    # filesystems, e.g. when the temp dir is on a different mount)
    shutil.move(tmp_path, output_path)

    return {
        "success": True,
        "path": str(output_path),
        "duration_ms": duration_ms,
        "text": text,
        "language": language
    }
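
# Usage sketch for generate_speech (hypothetical path and values; assumes
# network access to the Google TTS endpoint and ffmpeg available for pydub):
#
#   result = generate_speech("Hello there", output_path="output/hello.mp3")
#   if result["success"]:
#       print(result["path"], result["duration_ms"])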


def estimate_phoneme_timings(text: str, total_duration_ms: int) -> list[dict]:
    """
    Estimate per-character viseme timings from the text and total audio duration.

    This is a simple estimation - for more accurate results, use a
    forced alignment tool like Montreal Forced Aligner.

    Args:
        text: The text being spoken
        total_duration_ms: Total audio duration in milliseconds

    Returns:
        List of dicts with char, viseme, start_ms, end_ms, duration_ms
    """
    from .text_to_viseme import korean_text_to_visemes

    # Get viseme sequence with relative durations
    viseme_data = korean_text_to_visemes(text)

    if not viseme_data:
        return []

    # Calculate total relative duration
    total_relative = sum(item['duration'] for item in viseme_data)

    if total_relative == 0:
        return []

    # Scale to actual duration
    timings = []
    current_time = 0

    for item in viseme_data:
        relative_duration = item['duration']
        actual_duration = (relative_duration / total_relative) * total_duration_ms

        timings.append({
            'char': item['char'],
            'viseme': item['viseme'],
            'start_ms': int(current_time),
            'end_ms': int(current_time + actual_duration),
            'duration_ms': int(actual_duration)
        })

        current_time += actual_duration

    return timings
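
# Worked example of the scaling above (illustrative numbers): with relative
# durations [2, 1, 1] and total_duration_ms=1000, total_relative is 4, so the
# three segments get 500 ms, 250 ms and 250 ms, i.e. (start_ms, end_ms) spans
# of (0, 500), (500, 750) and (750, 1000).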


def generate_speech_with_timing(text: str, output_path: str | Path | None = None) -> dict:
    """
    Generate speech and phoneme timing information.

    Args:
        text: Text to convert to speech
        output_path: Path to save the audio file

    Returns:
        dict with audio path, duration, and phoneme timings
    """
    # Generate speech
    speech_result = generate_speech(text, output_path)

    if not speech_result["success"]:
        return speech_result

    # Estimate timings
    timings = estimate_phoneme_timings(text, speech_result["duration_ms"])

    return {
        **speech_result,
        "timings": timings
    }
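
# The merged result looks like (illustrative values, not real output):
#   {"success": True, "path": "output/speech_1a2b3c4d.mp3", "duration_ms": 1200,
#    "text": "...", "language": "ko",
#    "timings": [{"char": "...", "viseme": "...", "start_ms": 0,
#                 "end_ms": 150, "duration_ms": 150}, ...]}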


def get_viseme_at_time(timings: list[dict], time_ms: int) -> str:
    """
    Get the viseme that should be displayed at a given time.

    Args:
        timings: List of timing dicts from estimate_phoneme_timings
        time_ms: Time in milliseconds

    Returns:
        Viseme code
    """
    for timing in timings:
        if timing['start_ms'] <= time_ms < timing['end_ms']:
            return timing['viseme']

    return 'rest'
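

# A sketch of a bisect-based lookup for long timelines, assuming `timings` is
# sorted by start_ms (estimate_phoneme_timings builds it in order). For
# repeated queries, hoist the `starts` list out and build it once per timeline.
def get_viseme_at_time_fast(timings: list[dict], time_ms: int) -> str:
    """Binary-search variant of get_viseme_at_time (illustrative helper)."""
    from bisect import bisect_right
    starts = [t['start_ms'] for t in timings]
    i = bisect_right(starts, time_ms) - 1
    if i >= 0 and time_ms < timings[i]['end_ms']:
        return timings[i]['viseme']
    return 'rest'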


def create_lipsync_timeline(text: str, fps: int = 24) -> list[dict]:
    """
    Create a frame-by-frame lipsync timeline.

    Args:
        text: Text being spoken
        fps: Frames per second

    Returns:
        List of dicts with frame_number, time_ms, viseme
    """
    # Generate speech to get duration and timings (this also writes the audio
    # file to OUTPUT_DIR as a side effect)
    speech_result = generate_speech_with_timing(text)

    if not speech_result["success"]:
        return []

    duration_ms = speech_result["duration_ms"]
    timings = speech_result["timings"]
    frame_duration_ms = 1000 / fps

    timeline = []
    frame = 0
    time_ms = 0

    while time_ms < duration_ms:
        viseme = get_viseme_at_time(timings, time_ms)
        timeline.append({
            'frame': frame,
            'time_ms': int(time_ms),
            'viseme': viseme
        })
        frame += 1
        time_ms += frame_duration_ms

    # Add final rest frame
    timeline.append({
        'frame': frame,
        'time_ms': int(duration_ms),
        'viseme': 'rest'
    })

    return timeline
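

if __name__ == "__main__":
    # Smoke-test sketch (assumes network access for gTTS, ffmpeg for pydub,
    # and that TTS_LANGUAGE in config matches the demo text; the text and fps
    # here are placeholders)
    demo_timeline = create_lipsync_timeline("안녕하세요", fps=24)
    for entry in demo_timeline[:5]:
        print(entry)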
