# Streaming Avatar Dev Log - v6: TTS + LLM Integration

## Overview

In this part we integrate a TTS (Text-to-Speech) engine with an LLM (Large Language Model) to build an avatar system capable of natural conversation.

## Architecture

```
┌─────────────────────────────────────────────────────────────────┐
│                      TTS + LLM Integration                      │
├─────────────────────────────────────────────────────────────────┤
│                                                                 │
│  [User Input]                                                   │
│       │                                                         │
│       ▼                                                         │
│  ┌─────────────────────────────────────────────────────────┐    │
│  │                      LLM Pipeline                       │    │
│  │                                                         │    │
│  │  ┌─────────────┐    ┌─────────────┐    ┌────────────┐   │    │
│  │  │   Context   │───▶│   Gemini    │───▶│  Response  │   │    │
│  │  │   Manager   │    │  2.0 Flash  │    │  Streamer  │   │    │
│  │  └─────────────┘    └─────────────┘    └─────┬──────┘   │    │
│  │                                              │          │    │
│  └──────────────────────────────────────────────┼──────────┘    │
│                                                 │               │
│                 Streaming Text (chunk by chunk)                 │
│                                                 │               │
│  ┌──────────────────────────────────────────────▼──────────┐    │
│  │                      TTS Pipeline                       │    │
│  │                                                         │    │
│  │  ┌─────────────┐    ┌─────────────┐    ┌────────────┐   │    │
│  │  │    Text     │───▶│ TTS Engine  │───▶│   Audio    │   │    │
│  │  │   Chunker   │    │ (Streaming) │    │   Buffer   │   │    │
│  │  └─────────────┘    └─────────────┘    └─────┬──────┘   │    │
│  │                                              │          │    │
│  └──────────────────────────────────────────────┼──────────┘    │
│                                                 │               │
│                         Audio Stream → Lip Sync                 │
│                                                 ▼               │
│                                            [MuseTalk]           │
│                                                                 │
└─────────────────────────────────────────────────────────────────┘
```

## 1. LLM Integration (Gemini)

### Gemini streaming client

```python
# src/llm/gemini_streaming.py
import google.generativeai as genai
from typing import AsyncIterator
from dataclasses import dataclass


@dataclass
class ConversationMessage:
    role: str  # "user" or "assistant"
    content: str


class GeminiStreamingClient:
    def __init__(
        self,
        api_key: str,
        model_name: str = "gemini-2.0-flash-exp",
        max_context_messages: int = 10
    ):
        genai.configure(api_key=api_key)
        self.model = genai.GenerativeModel(
            model_name,
            system_instruction=self._get_system_prompt()
        )
        self.conversation_history: list[ConversationMessage] = []
        self.max_context_messages = max_context_messages

    def _get_system_prompt(self) -> str:
        # Korean persona prompt: "You are a friendly, natural AI avatar named
        # 아이리스 (Iris). Keep answers to 2-3 sentences, use conversational
        # language, express emotion appropriately, and answer questions directly."
        return """당신은 친절하고 자연스러운 AI 아바타입니다.

규칙:
1. 응답은 2-3문장으로 간결하게 작성합니다.
2. 자연스러운 구어체를 사용합니다.
3. 감정을 적절히 표현합니다.
4. 질문에는 직접적으로 답변합니다.

당신의 이름은 "아이리스"입니다."""

    def _build_messages(self, user_input: str) -> list[dict]:
        """Convert the conversation history into the Gemini API message format."""
        messages = []
        for msg in self.conversation_history[-self.max_context_messages:]:
            messages.append({
                "role": "user" if msg.role == "user" else "model",
                "parts": [msg.content]
            })
        messages.append({
            "role": "user",
            "parts": [user_input]
        })
        return messages

    async def generate_stream(
        self,
        user_input: str
    ) -> AsyncIterator[str]:
        """Generate a streaming response."""
        # Record the user turn in the conversation history
        self.conversation_history.append(
            ConversationMessage(role="user", content=user_input)
        )

        messages = self._build_messages(user_input)

        # Request a streamed completion
        response = await self.model.generate_content_async(
            messages,
            stream=True,
            generation_config={
                "max_output_tokens": 200,
                "temperature": 0.8,
                "top_p": 0.9
            }
        )

        full_response = ""
        async for chunk in response:
            if chunk.text:
                full_response += chunk.text
                yield chunk.text

        # Record the assistant turn in the conversation history
        self.conversation_history.append(
            ConversationMessage(role="assistant", content=full_response)
        )

    def clear_history(self):
        """Reset the conversation history."""
        self.conversation_history.clear()
```
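Before wiring in TTS, it helps to drive the client by itself and watch chunks arrive incrementally. Below is a minimal sketch (the script name, the API-key placeholder, and the prompt are just examples, not part of the project):

```python
# check_gemini_stream.py — quick manual check of the streaming client (illustrative)
import asyncio

from src.llm.gemini_streaming import GeminiStreamingClient


async def main():
    client = GeminiStreamingClient(api_key="YOUR_GEMINI_API_KEY")

    # Chunks are printed as soon as Gemini produces them,
    # rather than after the whole reply has finished.
    async for text_chunk in client.generate_stream("자기소개 해줄래?"):
        print(text_chunk, end="", flush=True)
    print()


if __name__ == "__main__":
    asyncio.run(main())
```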
### Sentence-level chunking

```python
# src/llm/sentence_chunker.py
import re
from typing import AsyncIterator


class SentenceChunker:
    """Split streaming LLM output into sentence-sized chunks."""

    SENTENCE_ENDINGS = re.compile(r'[.!?。!?]\s*')

    def __init__(self, min_chunk_length: int = 10):
        self.buffer = ""
        self.min_chunk_length = min_chunk_length

    async def process_stream(
        self,
        text_stream: AsyncIterator[str]
    ) -> AsyncIterator[str]:
        """Split a text stream into sentences."""
        async for text_chunk in text_stream:
            self.buffer += text_chunk

            # Look for a sentence ending; only emit once the matched
            # sentence is at least min_chunk_length characters long
            while True:
                match = self.SENTENCE_ENDINGS.search(self.buffer)
                if match and match.end() >= self.min_chunk_length:
                    # Emit the completed sentence
                    sentence = self.buffer[:match.end()]
                    self.buffer = self.buffer[match.end():]
                    yield sentence.strip()
                else:
                    break

        # Flush whatever is left in the buffer
        if self.buffer.strip():
            yield self.buffer.strip()
            self.buffer = ""
```
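The chunker can be exercised without touching the LLM at all by feeding it a fake token stream. A small sketch (the `fake_llm_stream` helper and its text are only for illustration):

```python
# Feed the chunker an artificial stream to see how sentences are reassembled
import asyncio
from typing import AsyncIterator

from src.llm.sentence_chunker import SentenceChunker


async def fake_llm_stream() -> AsyncIterator[str]:
    # Simulates LLM output that arrives in pieces split mid-sentence
    for piece in ["안녕하세요! 저는 아이", "리스예요. 오늘 기분", "이 어떠세요?"]:
        yield piece
        await asyncio.sleep(0.05)


async def main():
    chunker = SentenceChunker(min_chunk_length=5)
    async for sentence in chunker.process_stream(fake_llm_stream()):
        print(f"TTS input: {sentence}")
    # Prints three complete sentences:
    #   TTS input: 안녕하세요!
    #   TTS input: 저는 아이리스예요.
    #   TTS input: 오늘 기분이 어떠세요?


if __name__ == "__main__":
    asyncio.run(main())
```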
## 2. TTS Engines

### Streaming TTS base class

```python
# src/tts/base.py
from abc import ABC, abstractmethod
from typing import AsyncIterator

import numpy as np


class BaseTTSEngine(ABC):
    """Base class for TTS engines."""

    @abstractmethod
    async def synthesize_stream(
        self,
        text: str,
        voice_id: str = None
    ) -> AsyncIterator[np.ndarray]:
        """Convert text into a stream of audio chunks."""
        pass

    @property
    @abstractmethod
    def sample_rate(self) -> int:
        """Output sample rate."""
        pass
```

### Google TTS implementation

```python
# src/tts/google_tts.py
from typing import AsyncIterator

import numpy as np
from google.cloud import texttospeech
from google.cloud.texttospeech_v1 import SynthesisInput

from .base import BaseTTSEngine


class GoogleTTSEngine(BaseTTSEngine):
    """Google Cloud TTS engine."""

    VOICE_MAP = {
        "ko-female-1": ("ko-KR", "ko-KR-Neural2-A"),
        "ko-female-2": ("ko-KR", "ko-KR-Neural2-B"),
        "ko-male-1": ("ko-KR", "ko-KR-Neural2-C"),
        "ko-male-2": ("ko-KR", "ko-KR-Neural2-D"),
        "en-female-1": ("en-US", "en-US-Neural2-C"),
        "en-male-1": ("en-US", "en-US-Neural2-D"),
    }

    def __init__(self):
        self.client = texttospeech.TextToSpeechClient()
        self._sample_rate = 24000

    @property
    def sample_rate(self) -> int:
        return self._sample_rate

    async def synthesize_stream(
        self,
        text: str,
        voice_id: str = "ko-female-1"
    ) -> AsyncIterator[np.ndarray]:
        """Synthesize speech with Google Cloud TTS."""
        language_code, voice_name = self.VOICE_MAP.get(
            voice_id, ("ko-KR", "ko-KR-Neural2-A")
        )

        synthesis_input = SynthesisInput(text=text)
        voice = texttospeech.VoiceSelectionParams(
            language_code=language_code,
            name=voice_name
        )
        audio_config = texttospeech.AudioConfig(
            audio_encoding=texttospeech.AudioEncoding.LINEAR16,
            sample_rate_hertz=self._sample_rate,
            speaking_rate=1.0,
            pitch=0.0
        )

        response = self.client.synthesize_speech(
            input=synthesis_input,
            voice=voice,
            audio_config=audio_config
        )

        # Convert the WAV payload to a float32 array in [-1, 1]
        audio_data = np.frombuffer(
            response.audio_content[44:],  # skip the 44-byte WAV header
            dtype=np.int16
        ).astype(np.float32) / 32768.0

        # Yield in 100 ms chunks
        chunk_size = self._sample_rate // 10
        for i in range(0, len(audio_data), chunk_size):
            yield audio_data[i:i + chunk_size]
```

### ElevenLabs implementation

```python
# src/tts/elevenlabs_tts.py
import httpx
import numpy as np
from typing import AsyncIterator

from .base import BaseTTSEngine


class ElevenLabsTTSEngine(BaseTTSEngine):
    """ElevenLabs streaming TTS engine."""

    API_URL = "https://api.elevenlabs.io/v1/text-to-speech"

    VOICE_MAP = {
        "rachel": "21m00Tcm4TlvDq8ikWAM",
        "domi": "AZnzlk1XvdvUeBnXmlld",
        "bella": "EXAVITQu4vr4xnSDxMaL",
        "antoni": "ErXwobaYiN019PkySvjV",
    }

    def __init__(self, api_key: str):
        self.api_key = api_key
        self._sample_rate = 24000

    @property
    def sample_rate(self) -> int:
        return self._sample_rate

    async def synthesize_stream(
        self,
        text: str,
        voice_id: str = "rachel"
    ) -> AsyncIterator[np.ndarray]:
        """Stream speech from ElevenLabs."""
        voice = self.VOICE_MAP.get(voice_id, voice_id)
        url = f"{self.API_URL}/{voice}/stream"

        headers = {
            "xi-api-key": self.api_key,
            "Content-Type": "application/json"
        }
        data = {
            "text": text,
            "model_id": "eleven_turbo_v2",
            "voice_settings": {
                "stability": 0.5,
                "similarity_boost": 0.75
            },
            "output_format": "pcm_24000"
        }

        async with httpx.AsyncClient() as client:
            async with client.stream(
                "POST", url,
                headers=headers,
                json=data,
                timeout=30.0
            ) as response:
                buffer = bytearray()

                async for chunk in response.aiter_bytes():
                    buffer.extend(chunk)

                    # Yield once 100 ms worth of audio has accumulated
                    chunk_bytes = self._sample_rate * 2 // 10  # 16-bit = 2 bytes
                    while len(buffer) >= chunk_bytes:
                        audio_chunk = bytes(buffer[:chunk_bytes])
                        buffer = buffer[chunk_bytes:]

                        audio_array = np.frombuffer(
                            audio_chunk, dtype=np.int16
                        ).astype(np.float32) / 32768.0
                        yield audio_array

                # Flush whatever is left in the buffer
                if buffer:
                    audio_array = np.frombuffer(
                        bytes(buffer), dtype=np.int16
                    ).astype(np.float32) / 32768.0
                    yield audio_array
```
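Either engine can be checked in isolation by collecting the float chunks and writing them back out as 16-bit PCM. A rough sketch using the standard-library `wave` module (the output filename and the sample sentence are arbitrary):

```python
# Synthesize one sentence and dump it to a WAV file for listening
import asyncio
import wave

import numpy as np

from src.tts.google_tts import GoogleTTSEngine


async def main():
    engine = GoogleTTSEngine()

    chunks = []
    async for chunk in engine.synthesize_stream(
        "안녕하세요, 아이리스입니다.", voice_id="ko-female-1"
    ):
        chunks.append(chunk)

    # Convert float32 [-1, 1] back to 16-bit PCM and write a mono WAV
    audio = np.concatenate(chunks)
    pcm16 = (np.clip(audio, -1.0, 1.0) * 32767).astype(np.int16)
    with wave.open("tts_check.wav", "wb") as wav_file:
        wav_file.setnchannels(1)
        wav_file.setsampwidth(2)
        wav_file.setframerate(engine.sample_rate)
        wav_file.writeframes(pcm16.tobytes())


if __name__ == "__main__":
    asyncio.run(main())
```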
## 3. Integrated Pipeline

### LLM-TTS pipeline

```python
# src/pipeline/llm_tts_pipeline.py
from typing import AsyncIterator

import numpy as np

from src.llm.gemini_streaming import GeminiStreamingClient
from src.llm.sentence_chunker import SentenceChunker
from src.tts.base import BaseTTSEngine


class LLMTTSPipeline:
    """Pipeline that turns streamed LLM responses into TTS audio."""

    def __init__(
        self,
        llm_client: GeminiStreamingClient,
        tts_engine: BaseTTSEngine,
        min_tts_length: int = 20
    ):
        self.llm = llm_client
        self.tts = tts_engine
        self.chunker = SentenceChunker(min_chunk_length=min_tts_length)

    async def process(
        self,
        user_input: str
    ) -> AsyncIterator[tuple[str, np.ndarray]]:
        """
        Process user input and stream (text, audio) pairs.

        Returns:
            AsyncIterator of (text_chunk, audio_chunk) tuples
        """
        # Streamed LLM response
        llm_stream = self.llm.generate_stream(user_input)

        # Split into sentences
        sentence_stream = self.chunker.process_stream(llm_stream)

        # Convert each sentence to speech
        async for sentence in sentence_stream:
            async for audio_chunk in self.tts.synthesize_stream(sentence):
                yield (sentence, audio_chunk)
```

### Full avatar pipeline

```python
# src/pipeline/avatar_pipeline.py
from dataclasses import dataclass
from typing import AsyncIterator

import numpy as np

from src.pipeline.llm_tts_pipeline import LLMTTSPipeline
from src.lipsync.realtime_engine import RealtimeLipSyncEngine


@dataclass
class AvatarFrame:
    """A single avatar frame."""
    video_frame: np.ndarray   # RGB image
    audio_chunk: np.ndarray   # PCM audio
    text: str                 # text currently being spoken
    timestamp_ms: float       # timestamp


class AvatarPipeline:
    """End-to-end avatar pipeline."""

    def __init__(
        self,
        llm_tts_pipeline: LLMTTSPipeline,
        lipsync_engine: RealtimeLipSyncEngine,
        avatar_path: str
    ):
        self.llm_tts = llm_tts_pipeline
        self.lipsync = lipsync_engine
        self.avatar_data = self.lipsync.preprocess_avatar(avatar_path)
        self.frame_rate = 25
        self.audio_sample_rate = 24000

    async def process_input(
        self,
        user_input: str
    ) -> AsyncIterator[AvatarFrame]:
        """
        Process user input and stream avatar frames.

        Pipeline:
        1. User Input → LLM → Text Response (streaming)
        2. Text → TTS → Audio (streaming)
        3. Audio → Lip Sync → Video Frames
        """
        current_time_ms = 0
        audio_buffer = []
        current_text = ""

        async for text, audio_chunk in self.llm_tts.process(user_input):
            current_text = text
            audio_buffer.append(audio_chunk)

            # Once enough audio has accumulated, generate lip-sync frames
            combined_audio = np.concatenate(audio_buffer)
            audio_duration_ms = len(combined_audio) / self.audio_sample_rate * 1000

            if audio_duration_ms >= 100:  # at least 100 ms
                frames = await self.lipsync.process_audio_chunk(
                    self.avatar_data,
                    combined_audio,
                    self.audio_sample_rate
                )

                # Yield frames, pairing each with its slice of the audio
                frame_duration_ms = 1000 / self.frame_rate
                audio_per_frame = len(combined_audio) // len(frames)

                for i, frame in enumerate(frames):
                    audio_start = i * audio_per_frame
                    audio_end = (i + 1) * audio_per_frame
                    frame_audio = combined_audio[audio_start:audio_end]

                    yield AvatarFrame(
                        video_frame=frame,
                        audio_chunk=frame_audio,
                        text=current_text,
                        timestamp_ms=current_time_ms
                    )
                    current_time_ms += frame_duration_ms

                audio_buffer.clear()

        # Process any remaining audio
        if audio_buffer:
            combined_audio = np.concatenate(audio_buffer)
            frames = await self.lipsync.process_audio_chunk(
                self.avatar_data,
                combined_audio,
                self.audio_sample_rate
            )

            for frame in frames:
                yield AvatarFrame(
                    video_frame=frame,
                    audio_chunk=np.array([]),
                    text=current_text,
                    timestamp_ms=current_time_ms
                )
                current_time_ms += 1000 / self.frame_rate
```
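The full `AvatarPipeline` needs a `RealtimeLipSyncEngine` instance and a preprocessed avatar image, but the LLM → TTS stage can be run end to end on its own. A sketch of a driver (the prompt, API-key placeholder, and wiring are illustrative); since `process()` yields one pair per ~100 ms audio chunk, the audio duration is aggregated per sentence:

```python
# Drive the LLM → TTS stage by itself and report seconds of audio per sentence
import asyncio

from src.llm.gemini_streaming import GeminiStreamingClient
from src.pipeline.llm_tts_pipeline import LLMTTSPipeline
from src.tts.google_tts import GoogleTTSEngine


async def main():
    llm = GeminiStreamingClient(api_key="YOUR_GEMINI_API_KEY")
    tts = GoogleTTSEngine()
    pipeline = LLMTTSPipeline(llm_client=llm, tts_engine=tts)

    # Accumulate audio duration per sentence across the streamed chunks
    durations: dict[str, float] = {}
    async for sentence, audio_chunk in pipeline.process("오늘 날씨 어때?"):
        durations[sentence] = durations.get(sentence, 0.0) + len(audio_chunk) / tts.sample_rate

    for sentence, seconds in durations.items():
        print(f"{seconds:5.2f}s  {sentence}")


if __name__ == "__main__":
    asyncio.run(main())
```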
## 4. Emotion Expression

### Emotion detection and application

```python
# src/emotion/detector.py
from enum import Enum
import re


class Emotion(Enum):
    NEUTRAL = "neutral"
    HAPPY = "happy"
    SAD = "sad"
    SURPRISED = "surprised"
    ANGRY = "angry"


class EmotionDetector:
    """Detect emotion from text."""

    PATTERNS = {
        Emotion.HAPPY: [
            r'[ㅋㅎ]{2,}',                   # Korean laughter (ㅋㅋ, ㅎㅎ)
            r'[!]{2,}',
            r'기쁘|좋아|행복|즐거|웃|반가',  # glad / like / happy / fun / smile / welcome
            r'😀|😊|😁|🎉'
        ],
        Emotion.SAD: [
            r'[ㅠㅜ]{2,}',                   # Korean crying emoticon (ㅠㅠ)
            r'슬프|우울|안타|힘들|걱정',     # sad / gloomy / pity / hard / worried
            r'😢|😭|😞'
        ],
        Emotion.SURPRISED: [
            r'[?!]{2,}',
            r'놀라|깜짝|대박|헐|와',         # surprised / startled / wow
            r'😮|😱|🤯'
        ],
        Emotion.ANGRY: [
            r'화나|짜증|열받|분노',          # angry / annoyed / furious
            r'😠|😤|💢'
        ]
    }

    def detect(self, text: str) -> Emotion:
        """Return the first emotion whose patterns match the text."""
        for emotion, patterns in self.PATTERNS.items():
            for pattern in patterns:
                if re.search(pattern, text):
                    return emotion
        return Emotion.NEUTRAL
```

```python
# src/emotion/expression.py
from src.emotion.detector import Emotion


class EmotionExpression:
    """Adjust the avatar's delivery according to the detected emotion."""

    # Per-emotion TTS parameter overrides
    TTS_SETTINGS = {
        Emotion.NEUTRAL: {"speaking_rate": 1.0, "pitch": 0.0},
        Emotion.HAPPY: {"speaking_rate": 1.1, "pitch": 2.0},
        Emotion.SAD: {"speaking_rate": 0.9, "pitch": -2.0},
        Emotion.SURPRISED: {"speaking_rate": 1.2, "pitch": 4.0},
        Emotion.ANGRY: {"speaking_rate": 1.1, "pitch": -1.0},
    }

    # Per-emotion facial adjustments applied on top of the lip sync
    FACE_ADJUSTMENTS = {
        Emotion.NEUTRAL: {"eyebrow_raise": 0, "eye_open": 1.0},
        Emotion.HAPPY: {"eyebrow_raise": 0.2, "eye_open": 0.8},
        Emotion.SAD: {"eyebrow_raise": -0.3, "eye_open": 0.7},
        Emotion.SURPRISED: {"eyebrow_raise": 0.5, "eye_open": 1.3},
        Emotion.ANGRY: {"eyebrow_raise": -0.4, "eye_open": 0.9},
    }
```

## 5. Tests

```python
# tests/test_llm_tts_pipeline.py
import pytest

from src.llm.gemini_streaming import GeminiStreamingClient
from src.pipeline.llm_tts_pipeline import LLMTTSPipeline
from src.tts.google_tts import GoogleTTSEngine


@pytest.mark.asyncio
async def test_pipeline_streaming():
    """The pipeline should stream non-empty (text, audio) pairs."""
    pipeline = LLMTTSPipeline(
        llm_client=GeminiStreamingClient(api_key="..."),
        tts_engine=GoogleTTSEngine()
    )

    chunks = []
    async for text, audio in pipeline.process("안녕하세요!"):
        chunks.append((text, audio))

    assert len(chunks) > 0
    assert all(len(audio) > 0 for _, audio in chunks)
```

## Next Steps (v7)

In v7 we'll add user voice input (STT) to complete a two-way conversation system.

---

*This series consists of 10 posts in total.*