o
    U6i                  	   @   s   d Z ddlZddlZddlmZ ddlmZ ddlmZ ddl	m
Z
mZmZ dded	eeB dB d
edefddZdededee fddZdded	eeB dB defddZdee dedefddZddededee fddZdS )z.Text-to-Speech engine with timing information.    N)Path)gTTS)AudioSegment   )TTS_LANGUAGETTS_SPEED_SLOW
OUTPUT_DIRtextoutput_pathlanguagereturnc                 C   s   |pt }t| |td}tjddd}||j |j}W d   n1 s&w   Y  t|}t	|}|du rEt
dt| d  d }nt|}|jjddd	 t|| dt||| |d
S )a(  
    Generate speech audio from text using Google TTS.

    Args:
        text: Text to convert to speech
        output_path: Path to save the audio file (optional)
        language: Language code (default: from config)

    Returns:
        dict with audio info including duration and path
    )r	   langZslowz.mp3F)suffixdeleteNZspeech_i T)parentsexist_ok)successpathduration_msr	   r   )r   r   r   tempfileNamedTemporaryFilesavenamer   Zfrom_mp3lenr   hashr   parentmkdirosrenamestr)r	   r
   r   ZttstmpZtmp_pathaudior    r"   M/var/www/tkim.planitai.co.jp/blog/20251208-make-apng-tool/./src/tts_engine.pygenerate_speech
   s&   
r$   total_duration_msc           
   	   C   s   ddl m} || }|sg S tdd |D }|dkrg S g }d}|D ]'}|d }|| | }	||d |d t|t||	 t|	d	 ||	7 }q#|S )
a  
    Estimate phoneme timings based on text length and total audio duration.

    This is a simple estimation - for more accurate results, use a
    forced alignment tool like Montreal Forced Aligner.

    Args:
        text: The text being spoken
        total_duration_ms: Total audio duration in milliseconds

    Returns:
        List of dicts with phoneme, start_ms, end_ms, viseme
    r   )korean_text_to_visemesc                 s   s    | ]}|d  V  qdS )durationNr"   ).0itemr"   r"   r#   	<genexpr>O   s    z+estimate_phoneme_timings.<locals>.<genexpr>r   r'   charviseme)r+   r,   start_msend_msr   )text_to_visemer&   sumappendint)
r	   r%   r&   Zviseme_dataZtotal_relativetimingscurrent_timer)   Zrelative_durationZactual_durationr"   r"   r#   estimate_phoneme_timings8   s*   

r5   c                 C   s4   t | |}|d s|S t| |d }i |d|iS )z
    Generate speech and phoneme timing information.

    Args:
        text: Text to convert to speech
        output_path: Path to save the audio file

    Returns:
        dict with audio path, duration, and phoneme timings
    r   r   r3   )r$   r5   )r	   r
   speech_resultr3   r"   r"   r#   generate_speech_with_timingi   s   
r7   r3   time_msc                 C   s:   | D ]}|d |  kr|d k rn q|d   S qdS )z
    Get the viseme that should be displayed at a given time.

    Args:
        timings: List of timing dicts from estimate_phoneme_timings
        time_ms: Time in milliseconds

    Returns:
        Viseme code
    r-   r.   r,   restr"   )r3   r8   Ztimingr"   r"   r#   get_viseme_at_time   s
    r:      fpsc           
      C   s   t | }|d s
g S |d }|d }d| }g }d}d}||k r<t||}	||t||	d |d7 }||7 }||k s ||t|dd |S )	z
    Create a frame-by-frame lipsync timeline.

    Args:
        text: Text being spoken
        fps: Frames per second

    Returns:
        List of dicts with frame_number, time_ms, viseme
    r   r   r3   i  r   )framer8   r,   r   r9   )r7   r:   r1   r2   )
r	   r<   r6   r   r3   Zframe_duration_msZtimeliner=   r8   r,   r"   r"   r#   create_lipsync_timeline   s2   
r>   )NN)N)r;   )__doc__r   r   pathlibr   Zgttsr   Zpydubr   configr   r   r   r   dictr$   r2   listr5   r7   r:   r>   r"   r"   r"   r#   <module>   s    $. 1 