# Streaming Avatar Dev Log - v6: TTS + LLM Integration

## Overview

In this part we integrate a TTS (Text-to-Speech) engine with an LLM (Large Language Model) to build an avatar system capable of natural conversation.

## Architecture

```
┌─────────────────────────────────────────────────────────────────┐
│                      TTS + LLM Integration                      │
├─────────────────────────────────────────────────────────────────┤
│                                                                 │
│  [User Input]                                                   │
│       │                                                         │
│       ▼                                                         │
│  ┌─────────────────────────────────────────────────────────┐    │
│  │                      LLM Pipeline                       │    │
│  │                                                         │    │
│  │  ┌─────────────┐    ┌─────────────┐    ┌────────────┐   │    │
│  │  │   Context   │───▶│   Gemini    │───▶│  Response  │   │    │
│  │  │   Manager   │    │  2.0 Flash  │    │  Streamer  │   │    │
│  │  └─────────────┘    └─────────────┘    └─────┬──────┘   │    │
│  │                                              │          │    │
│  └──────────────────────────────────────────────┼──────────┘    │
│                                                 │               │
│                 Streaming Text (chunk by chunk)                 │
│                                                 │               │
│  ┌──────────────────────────────────────────────▼──────────┐    │
│  │                      TTS Pipeline                       │    │
│  │                                                         │    │
│  │  ┌─────────────┐    ┌─────────────┐    ┌────────────┐   │    │
│  │  │    Text     │───▶│ TTS Engine  │───▶│   Audio    │   │    │
│  │  │   Chunker   │    │ (Streaming) │    │   Buffer   │   │    │
│  │  └─────────────┘    └─────────────┘    └─────┬──────┘   │    │
│  │                                              │          │    │
│  └──────────────────────────────────────────────┼──────────┘    │
│                                                 │               │
│                         Audio Stream → Lip Sync                 │
│                                                 ▼               │
│                                            [MuseTalk]           │
│                                                                 │
└─────────────────────────────────────────────────────────────────┘
```

## 1. LLM Integration (Gemini)

### Gemini streaming client

```python
# src/llm/gemini_streaming.py
import google.generativeai as genai
from typing import AsyncIterator
from dataclasses import dataclass


@dataclass
class ConversationMessage:
    role: str  # "user" or "assistant"
    content: str


class GeminiStreamingClient:
    def __init__(
        self,
        api_key: str,
        model_name: str = "gemini-2.0-flash-exp",
        max_context_messages: int = 10
    ):
        genai.configure(api_key=api_key)
        self.model = genai.GenerativeModel(
            model_name,
            system_instruction=self._get_system_prompt()
        )
        self.conversation_history: list[ConversationMessage] = []
        self.max_context_messages = max_context_messages

    def _get_system_prompt(self) -> str:
        # Korean persona prompt: "You are a friendly, natural AI avatar named
        # 아이리스 (Iris). Keep answers to 2-3 sentences, use conversational
        # language, express emotion appropriately, and answer questions directly."
        return """당신은 친절하고 자연스러운 AI 아바타입니다.

규칙:
1. 응답은 2-3문장으로 간결하게 작성합니다.
2. 자연스러운 구어체를 사용합니다.
3. 감정을 적절히 표현합니다.
4. 질문에는 직접적으로 답변합니다.

당신의 이름은 "아이리스"입니다."""

    def _build_messages(self, user_input: str) -> list[dict]:
        """Convert the conversation history into the Gemini API message format."""
        messages = []
        for msg in self.conversation_history[-self.max_context_messages:]:
            messages.append({
                "role": "user" if msg.role == "user" else "model",
                "parts": [msg.content]
            })
        messages.append({
            "role": "user",
            "parts": [user_input]
        })
        return messages

    async def generate_stream(
        self,
        user_input: str
    ) -> AsyncIterator[str]:
        """Generate a streaming response."""
        # Record the user turn in the conversation history
        self.conversation_history.append(
            ConversationMessage(role="user", content=user_input)
        )

        messages = self._build_messages(user_input)

        # Request a streamed completion
        response = await self.model.generate_content_async(
            messages,
            stream=True,
            generation_config={
                "max_output_tokens": 200,
                "temperature": 0.8,
                "top_p": 0.9
            }
        )

        full_response = ""
        async for chunk in response:
            if chunk.text:
                full_response += chunk.text
                yield chunk.text

        # Record the assistant turn in the conversation history
        self.conversation_history.append(
            ConversationMessage(role="assistant", content=full_response)
        )

    def clear_history(self):
        """Reset the conversation history."""
        self.conversation_history.clear()
```
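Before wiring in TTS, it helps to drive the client by itself and watch chunks arrive incrementally. Below is a minimal sketch (the script name, the API-key placeholder, and the prompt are just examples, not part of the project):

```python
# check_gemini_stream.py — quick manual check of the streaming client (illustrative)
import asyncio

from src.llm.gemini_streaming import GeminiStreamingClient


async def main():
    client = GeminiStreamingClient(api_key="YOUR_GEMINI_API_KEY")

    # Chunks are printed as soon as Gemini produces them,
    # rather than after the whole reply has finished.
    async for text_chunk in client.generate_stream("자기소개 해줄래?"):
        print(text_chunk, end="", flush=True)
    print()


if __name__ == "__main__":
    asyncio.run(main())
```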
### Sentence-level chunking

```python
# src/llm/sentence_chunker.py
import re
from typing import AsyncIterator


class SentenceChunker:
    """Split streaming LLM output into sentence-sized chunks."""

    SENTENCE_ENDINGS = re.compile(r'[.!?。!?]\s*')

    def __init__(self, min_chunk_length: int = 10):
        self.buffer = ""
        self.min_chunk_length = min_chunk_length

    async def process_stream(
        self,
        text_stream: AsyncIterator[str]
    ) -> AsyncIterator[str]:
        """Split a text stream into sentences."""
        async for text_chunk in text_stream:
            self.buffer += text_chunk

            # Look for a sentence ending; only emit once the matched
            # sentence is at least min_chunk_length characters long
            while True:
                match = self.SENTENCE_ENDINGS.search(self.buffer)
                if match and match.end() >= self.min_chunk_length:
                    # Emit the completed sentence
                    sentence = self.buffer[:match.end()]
                    self.buffer = self.buffer[match.end():]
                    yield sentence.strip()
                else:
                    break

        # Flush whatever is left in the buffer
        if self.buffer.strip():
            yield self.buffer.strip()
            self.buffer = ""
```
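The chunker can be exercised without touching the LLM at all by feeding it a fake token stream. A small sketch (the `fake_llm_stream` helper and its text are only for illustration):

```python
# Feed the chunker an artificial stream to see how sentences are reassembled
import asyncio
from typing import AsyncIterator

from src.llm.sentence_chunker import SentenceChunker


async def fake_llm_stream() -> AsyncIterator[str]:
    # Simulates LLM output that arrives in pieces split mid-sentence
    for piece in ["안녕하세요! 저는 아이", "리스예요. 오늘 기분", "이 어떠세요?"]:
        yield piece
        await asyncio.sleep(0.05)


async def main():
    chunker = SentenceChunker(min_chunk_length=5)
    async for sentence in chunker.process_stream(fake_llm_stream()):
        print(f"TTS input: {sentence}")
    # Prints three complete sentences:
    #   TTS input: 안녕하세요!
    #   TTS input: 저는 아이리스예요.
    #   TTS input: 오늘 기분이 어떠세요?


if __name__ == "__main__":
    asyncio.run(main())
```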
## 2. TTS Engines

### Streaming TTS base class

```python
# src/tts/base.py
from abc import ABC, abstractmethod
from typing import AsyncIterator

import numpy as np


class BaseTTSEngine(ABC):
    """Base class for TTS engines."""

    @abstractmethod
    async def synthesize_stream(
        self,
        text: str,
        voice_id: str = None
    ) -> AsyncIterator[np.ndarray]:
        """Convert text into a stream of audio chunks."""
        pass

    @property
    @abstractmethod
    def sample_rate(self) -> int:
        """Output sample rate."""
        pass
```

### Google TTS implementation

```python
# src/tts/google_tts.py
from typing import AsyncIterator

import numpy as np
from google.cloud import texttospeech
from google.cloud.texttospeech_v1 import SynthesisInput

from .base import BaseTTSEngine


class GoogleTTSEngine(BaseTTSEngine):
    """Google Cloud TTS engine."""

    VOICE_MAP = {
        "ko-female-1": ("ko-KR", "ko-KR-Neural2-A"),
        "ko-female-2": ("ko-KR", "ko-KR-Neural2-B"),
        "ko-male-1": ("ko-KR", "ko-KR-Neural2-C"),
        "ko-male-2": ("ko-KR", "ko-KR-Neural2-D"),
        "en-female-1": ("en-US", "en-US-Neural2-C"),
        "en-male-1": ("en-US", "en-US-Neural2-D"),
    }

    def __init__(self):
        self.client = texttospeech.TextToSpeechClient()
        self._sample_rate = 24000

    @property
    def sample_rate(self) -> int:
        return self._sample_rate

    async def synthesize_stream(
        self,
        text: str,
        voice_id: str = "ko-female-1"
    ) -> AsyncIterator[np.ndarray]:
        """Synthesize speech with Google Cloud TTS."""
        language_code, voice_name = self.VOICE_MAP.get(
            voice_id, ("ko-KR", "ko-KR-Neural2-A")
        )

        synthesis_input = SynthesisInput(text=text)
        voice = texttospeech.VoiceSelectionParams(
            language_code=language_code,
            name=voice_name
        )
        audio_config = texttospeech.AudioConfig(
            audio_encoding=texttospeech.AudioEncoding.LINEAR16,
            sample_rate_hertz=self._sample_rate,
            speaking_rate=1.0,
            pitch=0.0
        )

        response = self.client.synthesize_speech(
            input=synthesis_input,
            voice=voice,
            audio_config=audio_config
        )

        # Convert the WAV payload to a float32 array in [-1, 1]
        audio_data = np.frombuffer(
            response.audio_content[44:],  # skip the 44-byte WAV header
            dtype=np.int16
        ).astype(np.float32) / 32768.0

        # Yield in 100 ms chunks
        chunk_size = self._sample_rate // 10
        for i in range(0, len(audio_data), chunk_size):
            yield audio_data[i:i + chunk_size]
```

### ElevenLabs implementation

```python
# src/tts/elevenlabs_tts.py
import httpx
import numpy as np
from typing import AsyncIterator

from .base import BaseTTSEngine


class ElevenLabsTTSEngine(BaseTTSEngine):
    """ElevenLabs streaming TTS engine."""

    API_URL = "https://api.elevenlabs.io/v1/text-to-speech"

    VOICE_MAP = {
        "rachel": "21m00Tcm4TlvDq8ikWAM",
        "domi": "AZnzlk1XvdvUeBnXmlld",
        "bella": "EXAVITQu4vr4xnSDxMaL",
        "antoni": "ErXwobaYiN019PkySvjV",
    }

    def __init__(self, api_key: str):
        self.api_key = api_key
        self._sample_rate = 24000

    @property
    def sample_rate(self) -> int:
        return self._sample_rate

    async def synthesize_stream(
        self,
        text: str,
        voice_id: str = "rachel"
    ) -> AsyncIterator[np.ndarray]:
        """Stream speech from ElevenLabs."""
        voice = self.VOICE_MAP.get(voice_id, voice_id)
        url = f"{self.API_URL}/{voice}/stream"

        headers = {
            "xi-api-key": self.api_key,
            "Content-Type": "application/json"
        }
        data = {
            "text": text,
            "model_id": "eleven_turbo_v2",
            "voice_settings": {
                "stability": 0.5,
                "similarity_boost": 0.75
            },
            "output_format": "pcm_24000"
        }

        async with httpx.AsyncClient() as client:
            async with client.stream(
                "POST", url,
                headers=headers,
                json=data,
                timeout=30.0
            ) as response:
                buffer = bytearray()

                async for chunk in response.aiter_bytes():
                    buffer.extend(chunk)

                    # Yield once 100 ms worth of audio has accumulated
                    chunk_bytes = self._sample_rate * 2 // 10  # 16-bit = 2 bytes
                    while len(buffer) >= chunk_bytes:
                        audio_chunk = bytes(buffer[:chunk_bytes])
                        buffer = buffer[chunk_bytes:]

                        audio_array = np.frombuffer(
                            audio_chunk, dtype=np.int16
                        ).astype(np.float32) / 32768.0
                        yield audio_array

                # Flush whatever is left in the buffer
                if buffer:
                    audio_array = np.frombuffer(
                        bytes(buffer), dtype=np.int16
                    ).astype(np.float32) / 32768.0
                    yield audio_array
```
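Either engine can be checked in isolation by collecting the float chunks and writing them back out as 16-bit PCM. A rough sketch using the standard-library `wave` module (the output filename and the sample sentence are arbitrary):

```python
# Synthesize one sentence and dump it to a WAV file for listening
import asyncio
import wave

import numpy as np

from src.tts.google_tts import GoogleTTSEngine


async def main():
    engine = GoogleTTSEngine()

    chunks = []
    async for chunk in engine.synthesize_stream(
        "안녕하세요, 아이리스입니다.", voice_id="ko-female-1"
    ):
        chunks.append(chunk)

    # Convert float32 [-1, 1] back to 16-bit PCM and write a mono WAV
    audio = np.concatenate(chunks)
    pcm16 = (np.clip(audio, -1.0, 1.0) * 32767).astype(np.int16)
    with wave.open("tts_check.wav", "wb") as wav_file:
        wav_file.setnchannels(1)
        wav_file.setsampwidth(2)
        wav_file.setframerate(engine.sample_rate)
        wav_file.writeframes(pcm16.tobytes())


if __name__ == "__main__":
    asyncio.run(main())
```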
## 3. Integrated Pipeline

### LLM-TTS pipeline

```python
# src/pipeline/llm_tts_pipeline.py
from typing import AsyncIterator

import numpy as np

from src.llm.gemini_streaming import GeminiStreamingClient
from src.llm.sentence_chunker import SentenceChunker
from src.tts.base import BaseTTSEngine


class LLMTTSPipeline:
    """Pipeline that turns streamed LLM responses into TTS audio."""

    def __init__(
        self,
        llm_client: GeminiStreamingClient,
        tts_engine: BaseTTSEngine,
        min_tts_length: int = 20
    ):
        self.llm = llm_client
        self.tts = tts_engine
        self.chunker = SentenceChunker(min_chunk_length=min_tts_length)

    async def process(
        self,
        user_input: str
    ) -> AsyncIterator[tuple[str, np.ndarray]]:
        """
        Process user input and stream (text, audio) pairs.

        Returns:
            AsyncIterator of (text_chunk, audio_chunk) tuples
        """
        # Streamed LLM response
        llm_stream = self.llm.generate_stream(user_input)

        # Split into sentences
        sentence_stream = self.chunker.process_stream(llm_stream)

        # Convert each sentence to speech
        async for sentence in sentence_stream:
            async for audio_chunk in self.tts.synthesize_stream(sentence):
                yield (sentence, audio_chunk)
```

### Full avatar pipeline

```python
# src/pipeline/avatar_pipeline.py
from dataclasses import dataclass
from typing import AsyncIterator

import numpy as np

from src.pipeline.llm_tts_pipeline import LLMTTSPipeline
from src.lipsync.realtime_engine import RealtimeLipSyncEngine


@dataclass
class AvatarFrame:
    """A single avatar frame."""
    video_frame: np.ndarray   # RGB image
    audio_chunk: np.ndarray   # PCM audio
    text: str                 # text currently being spoken
    timestamp_ms: float       # timestamp


class AvatarPipeline:
    """End-to-end avatar pipeline."""

    def __init__(
        self,
        llm_tts_pipeline: LLMTTSPipeline,
        lipsync_engine: RealtimeLipSyncEngine,
        avatar_path: str
    ):
        self.llm_tts = llm_tts_pipeline
        self.lipsync = lipsync_engine
        self.avatar_data = self.lipsync.preprocess_avatar(avatar_path)
        self.frame_rate = 25
        self.audio_sample_rate = 24000

    async def process_input(
        self,
        user_input: str
    ) -> AsyncIterator[AvatarFrame]:
        """
        Process user input and stream avatar frames.

        Pipeline:
        1. User Input → LLM → Text Response (streaming)
        2. Text → TTS → Audio (streaming)
        3. Audio → Lip Sync → Video Frames
        """
        current_time_ms = 0
        audio_buffer = []
        current_text = ""

        async for text, audio_chunk in self.llm_tts.process(user_input):
            current_text = text
            audio_buffer.append(audio_chunk)

            # Once enough audio has accumulated, generate lip-sync frames
            combined_audio = np.concatenate(audio_buffer)
            audio_duration_ms = len(combined_audio) / self.audio_sample_rate * 1000

            if audio_duration_ms >= 100:  # at least 100 ms
                frames = await self.lipsync.process_audio_chunk(
                    self.avatar_data,
                    combined_audio,
                    self.audio_sample_rate
                )

                # Yield frames, pairing each with its slice of the audio
                frame_duration_ms = 1000 / self.frame_rate
                audio_per_frame = len(combined_audio) // len(frames)

                for i, frame in enumerate(frames):
                    audio_start = i * audio_per_frame
                    audio_end = (i + 1) * audio_per_frame
                    frame_audio = combined_audio[audio_start:audio_end]

                    yield AvatarFrame(
                        video_frame=frame,
                        audio_chunk=frame_audio,
                        text=current_text,
                        timestamp_ms=current_time_ms
                    )
                    current_time_ms += frame_duration_ms

                audio_buffer.clear()

        # Process any remaining audio
        if audio_buffer:
            combined_audio = np.concatenate(audio_buffer)
            frames = await self.lipsync.process_audio_chunk(
                self.avatar_data,
                combined_audio,
                self.audio_sample_rate
            )

            for frame in frames:
                yield AvatarFrame(
                    video_frame=frame,
                    audio_chunk=np.array([]),
                    text=current_text,
                    timestamp_ms=current_time_ms
                )
                current_time_ms += 1000 / self.frame_rate
```
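The full `AvatarPipeline` needs a `RealtimeLipSyncEngine` instance and a preprocessed avatar image, but the LLM → TTS stage can be run end to end on its own. A sketch of a driver (the prompt, API-key placeholder, and wiring are illustrative); since `process()` yields one pair per ~100 ms audio chunk, the audio duration is aggregated per sentence:

```python
# Drive the LLM → TTS stage by itself and report seconds of audio per sentence
import asyncio

from src.llm.gemini_streaming import GeminiStreamingClient
from src.pipeline.llm_tts_pipeline import LLMTTSPipeline
from src.tts.google_tts import GoogleTTSEngine


async def main():
    llm = GeminiStreamingClient(api_key="YOUR_GEMINI_API_KEY")
    tts = GoogleTTSEngine()
    pipeline = LLMTTSPipeline(llm_client=llm, tts_engine=tts)

    # Accumulate audio duration per sentence across the streamed chunks
    durations: dict[str, float] = {}
    async for sentence, audio_chunk in pipeline.process("오늘 날씨 어때?"):
        durations[sentence] = durations.get(sentence, 0.0) + len(audio_chunk) / tts.sample_rate

    for sentence, seconds in durations.items():
        print(f"{seconds:5.2f}s  {sentence}")


if __name__ == "__main__":
    asyncio.run(main())
```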
## 4. Emotion Expression

### Emotion detection and application

```python
# src/emotion/detector.py
from enum import Enum
import re


class Emotion(Enum):
    NEUTRAL = "neutral"
    HAPPY = "happy"
    SAD = "sad"
    SURPRISED = "surprised"
    ANGRY = "angry"


class EmotionDetector:
    """Detect emotion from text."""

    PATTERNS = {
        Emotion.HAPPY: [
            r'[ㅋㅎ]{2,}',                   # Korean laughter (ㅋㅋ, ㅎㅎ)
            r'[!]{2,}',
            r'기쁘|좋아|행복|즐거|웃|반가',  # glad / like / happy / fun / smile / welcome
            r'😀|😊|😁|🎉'
        ],
        Emotion.SAD: [
            r'[ㅠㅜ]{2,}',                   # Korean crying emoticon (ㅠㅠ)
            r'슬프|우울|안타|힘들|걱정',     # sad / gloomy / pity / hard / worried
            r'😢|😭|😞'
        ],
        Emotion.SURPRISED: [
            r'[?!]{2,}',
            r'놀라|깜짝|대박|헐|와',         # surprised / startled / wow
            r'😮|😱|🤯'
        ],
        Emotion.ANGRY: [
            r'화나|짜증|열받|분노',          # angry / annoyed / furious
            r'😠|😤|💢'
        ]
    }

    def detect(self, text: str) -> Emotion:
        """Return the first emotion whose patterns match the text."""
        for emotion, patterns in self.PATTERNS.items():
            for pattern in patterns:
                if re.search(pattern, text):
                    return emotion
        return Emotion.NEUTRAL
```

```python
# src/emotion/expression.py
from src.emotion.detector import Emotion


class EmotionExpression:
    """Adjust the avatar's delivery according to the detected emotion."""

    # Per-emotion TTS parameter overrides
    TTS_SETTINGS = {
        Emotion.NEUTRAL: {"speaking_rate": 1.0, "pitch": 0.0},
        Emotion.HAPPY: {"speaking_rate": 1.1, "pitch": 2.0},
        Emotion.SAD: {"speaking_rate": 0.9, "pitch": -2.0},
        Emotion.SURPRISED: {"speaking_rate": 1.2, "pitch": 4.0},
        Emotion.ANGRY: {"speaking_rate": 1.1, "pitch": -1.0},
    }

    # Per-emotion facial adjustments applied on top of the lip sync
    FACE_ADJUSTMENTS = {
        Emotion.NEUTRAL: {"eyebrow_raise": 0, "eye_open": 1.0},
        Emotion.HAPPY: {"eyebrow_raise": 0.2, "eye_open": 0.8},
        Emotion.SAD: {"eyebrow_raise": -0.3, "eye_open": 0.7},
        Emotion.SURPRISED: {"eyebrow_raise": 0.5, "eye_open": 1.3},
        Emotion.ANGRY: {"eyebrow_raise": -0.4, "eye_open": 0.9},
    }
```

## 5. Tests

```python
# tests/test_llm_tts_pipeline.py
import pytest

from src.llm.gemini_streaming import GeminiStreamingClient
from src.pipeline.llm_tts_pipeline import LLMTTSPipeline
from src.tts.google_tts import GoogleTTSEngine


@pytest.mark.asyncio
async def test_pipeline_streaming():
    """The pipeline should stream non-empty (text, audio) pairs."""
    pipeline = LLMTTSPipeline(
        llm_client=GeminiStreamingClient(api_key="..."),
        tts_engine=GoogleTTSEngine()
    )

    chunks = []
    async for text, audio in pipeline.process("안녕하세요!"):
        chunks.append((text, audio))

    assert len(chunks) > 0
    assert all(len(audio) > 0 for _, audio in chunks)
```

## Next Steps (v7)

In v7 we'll add user voice input (STT) to complete a two-way conversation system.

---

*This series consists of 10 posts in total.*