"""Gradio app that reads Japanese text aloud with voice templates and effects.

Speech is synthesized with gTTS, then post-processed with pydub: the playback
rate is altered (pitch / speed), an optional effect is applied, and the
template's volume is applied before the result is exported as an mp3 file.
"""

import math
import os
import tempfile

import gradio as gr
import numpy as np
from gtts import gTTS
from pydub import AudioSegment

# Voice templates. "rate" is a percentage of normal playback speed
# (generate_tts divides it by 100); "volume" is a linear amplitude ratio.
TEMPLATES = {
    "パラオ高め(ポーランドボール風)": {"rate": 180, "volume": 1.0},
    "低めのナレーター": {"rate": 120, "volume": 0.8},
    "普通の話し方": {"rate": 150, "volume": 1.0},
    "元気な女の子": {"rate": 180, "volume": 1.2},
    "落ち着いた男性": {"rate": 130, "volume": 0.9},
    "ロボット風(機械的)": {"rate": 140, "volume": 1.0},
    "さっぱりした女性": {"rate": 160, "volume": 1.1},
    "しっとりした声": {"rate": 140, "volume": 0.9},
    "おじさん風": {"rate": 60, "volume": 0.75},
    "怒った声": {"rate": 45, "volume": 0.9},
}

# Effect names offered in the UI; apply_effect() dispatches on these strings.
EFFECTS = ["なし", "ふわふわ化", "かちかち化", "減衰", "リバーブ", "音揺れ"]


def generate_tts(text, template_name, pitch_factor=1.0, speed_factor=1.0,
                 effect_type="なし", effect_strength=1.0):
    """Synthesize *text* and return the path of the processed mp3 file.

    Args:
        text: Text to read aloud (synthesized with gTTS, lang="ja").
        template_name: Key into TEMPLATES; unknown names fall back to
            rate 150 / volume 1.0.
        pitch_factor: Multiplier passed to change_pitch().
        speed_factor: Multiplier applied to the template's rate.
        effect_type: One of EFFECTS; unknown values apply no effect.
        effect_strength: Strength multiplier passed to apply_effect().

    Returns:
        Path to an mp3 file in the temp directory. The file is left on disk
        because the caller (the Gradio Audio output) reads it afterwards.

    Raises:
        gr.Error: If *text* is empty or whitespace only.
    """
    if not text or not text.strip():
        # gTTS rejects empty input with an opaque assertion; surface a
        # readable message in the UI instead.
        raise gr.Error("読み上げるテキストを入力してください。")

    template = TEMPLATES.get(template_name, {"rate": 150, "volume": 1.0})
    rate = template["rate"] * speed_factor
    volume = template["volume"]

    # Reserve a temp path, and close the handle before gTTS writes to it
    # (writing to a still-open temp file fails on some platforms).
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as f:
        tts_path = f.name
    root, ext = os.path.splitext(tts_path)
    output_path = f"{root}_modified{ext}"

    try:
        gTTS(text=text, lang='ja').save(tts_path)
        sound = AudioSegment.from_mp3(tts_path)
    finally:
        # The raw synthesis output is only an intermediate; don't leak it.
        try:
            os.remove(tts_path)
        except OSError:
            pass  # best-effort cleanup; a leftover temp file is not fatal

    sound = change_pitch(sound, pitch_factor)
    # The template rate is a percentage, so convert it to a multiplier.
    sound = change_speed(sound, rate / 100)
    sound = apply_effect(sound, effect_type, effect_strength)

    # Apply the template volume (linear ratio -> dB gain).
    if volume > 0 and volume != 1.0:
        sound = sound.apply_gain(20 * math.log10(volume))

    sound.export(output_path, format="mp3")
    return output_path


def _scale_playback_rate(sound, factor):
    """Reinterpret the samples at frame_rate * factor, then resample to 44100 Hz.

    Raises:
        ValueError: If *factor* is not positive.
    """
    if factor <= 0:
        raise ValueError(f"factor must be positive, got {factor!r}")
    new_frame_rate = max(1, int(sound.frame_rate * factor))
    shifted = sound._spawn(sound.raw_data,
                           overrides={"frame_rate": new_frame_rate})
    return shifted.set_frame_rate(44100)


def change_pitch(sound, factor):
    """Return *sound* with its playback rate scaled by *factor*.

    Because this works by changing the frame rate, pitch and tempo change
    together (factor > 1 is both higher and faster).
    """
    return _scale_playback_rate(sound, factor)


def change_speed(sound, speed=1.0):
    """Return *sound* with its playback rate scaled by *speed*.

    Uses the same frame-rate technique as change_pitch(), so the pitch
    shifts along with the tempo.
    """
    return _scale_playback_rate(sound, speed)


def apply_effect(sound, effect_type, effect_strength):
    """Apply the effect named *effect_type* to *sound* and return the result.

    Unknown effect names (including "なし") return *sound* unchanged.
    """
    if effect_type == "ふわふわ化":
        return sound.low_pass_filter(1000 * effect_strength)
    if effect_type == "かちかち化":
        return sound.high_pass_filter(3000 * effect_strength)
    if effect_type == "減衰":
        # A fade cannot be longer than the clip itself; clamp it.
        fade_ms = min(len(sound), int(len(sound) * effect_strength))
        return sound.fade_out(fade_ms) if fade_ms > 0 else sound
    if effect_type == "リバーブ":
        # pydub needs whole-millisecond fade lengths, and the UI slider
        # delivers floats, so convert explicitly.
        fade_ms = min(len(sound), int(200 * effect_strength))
        tail = sound.reverse()
        if fade_ms > 0:
            tail = tail.fade_in(fade_ms).fade_out(fade_ms)
        # NOTE(review): `+` between segments concatenates, so this appends a
        # faded copy after the original rather than mixing it in; kept as-is.
        return (sound + tail.reverse()) - (10 * effect_strength)
    if effect_type == "音揺れ":
        return wobble(sound, effect_strength)
    return sound


def wobble(sound, strength):
    """Randomly vary the playback rate of each 100 ms chunk of *sound*.

    Each chunk gets a factor drawn uniformly from 1 +/- 0.05 * strength.
    """
    chunk_ms = 100
    # Cap the depth so the factor stays positive even for very large strengths.
    depth = min(0.05 * strength, 0.95)
    wobbled = AudioSegment.empty()
    for start in range(0, len(sound), chunk_ms):
        factor = np.random.uniform(1 - depth, 1 + depth)
        wobbled += change_pitch(sound[start:start + chunk_ms], factor)
    return wobbled


with gr.Blocks() as app:
    gr.Markdown("# オリジナル声読み上げ機")

    with gr.Row():
        text_input = gr.Textbox(label="読み上げるテキスト", lines=2,
                                placeholder="ここに入力...")

    with gr.Row():
        template_dropdown = gr.Dropdown(
            choices=list(TEMPLATES.keys()),
            value="パラオ高め(ポーランドボール風)",
            label="テンプレートを選ぶ",
        )

    with gr.Row():
        pitch_slider = gr.Slider(0.1, 5.0, value=1.0, step=0.05,
                                 label="ピッチ倍率(高く・低く)")
        speed_slider = gr.Slider(0.1, 5.0, value=1.0, step=0.05,
                                 label="速度倍率(速く・遅く)")

    with gr.Row():
        effect_dropdown = gr.Dropdown(choices=EFFECTS, value="なし",
                                      label="エフェクトを選ぶ")
        effect_strength_slider = gr.Slider(0.1, 10.0, value=1.0, step=0.05,
                                           label="エフェクト強さ")

    with gr.Row():
        submit_btn = gr.Button("生成する")

    audio_output = gr.Audio(label="出力音声", type="filepath")

    submit_btn.click(
        fn=generate_tts,
        inputs=[text_input, template_dropdown, pitch_slider, speed_slider,
                effect_dropdown, effect_strength_slider],
        outputs=audio_output,
    )

app.launch()