sungo-ganpare committed on
Commit 3ff2783
Parent(s): 8263cff
Files changed (3)
  1. app_wsl copy.py +0 -669
  2. app_wsl.py +4 -39
  3. run_parakeet.bat +37 -23
app_wsl copy.py DELETED
@@ -1,669 +0,0 @@
-from nemo.collections.asr.models import ASRModel
-import torch
-import gradio as gr
-import spaces
-import gc
-import shutil
-from pathlib import Path
-from pydub import AudioSegment
-import numpy as np
-import os
-import gradio.themes as gr_themes
-import csv
-import json
-from typing import List, Tuple
-
-device = "cuda" if torch.cuda.is_available() else "cpu"
-MODEL_NAME = "nvidia/parakeet-tdt-0.6b-v2"
-
-model = ASRModel.from_pretrained(model_name=MODEL_NAME)
-model.eval()
-
-def start_session(request: gr.Request):
-    session_hash = request.session_hash
-    # Use the outputs folder inside the project directory
-    base_dir = Path(__file__).parent
-    session_dir = base_dir / "outputs" / session_hash
-    session_dir.mkdir(parents=True, exist_ok=True)
-    print(f"Session with hash {session_hash} started in {session_dir}")
-    return session_dir.as_posix()
-
-def end_session(request: gr.Request):
-    session_hash = request.session_hash
-    base_dir = Path(__file__).parent
-    session_dir = base_dir / "outputs" / session_hash
-    if session_dir.exists():
-        print(f"Session directory {session_dir} will be preserved.")
-        # Changed so the session directory is not deleted
-        # shutil.rmtree(session_dir)
-    print(f"Session with hash {session_hash} ended.")
-
-def get_audio_segment(audio_path, start_second, end_second):
-    if not audio_path or not Path(audio_path).exists():
-        print(f"Warning: Audio path '{audio_path}' not found or invalid for clipping.")
-        return None
-    try:
-        start_ms = int(start_second * 1000)
-        end_ms = int(end_second * 1000)
-
-        start_ms = max(0, start_ms)
-        if end_ms <= start_ms:
-            print(f"Warning: End time ({end_second}s) is not after start time ({start_second}s). Adjusting end time.")
-            end_ms = start_ms + 100
-
-        audio = AudioSegment.from_file(audio_path)
-        clipped_audio = audio[start_ms:end_ms]
-
-        samples = np.array(clipped_audio.get_array_of_samples())
-        if clipped_audio.channels == 2:
-            samples = samples.reshape((-1, 2)).mean(axis=1).astype(samples.dtype)
-
-        frame_rate = clipped_audio.frame_rate
-        if frame_rate <= 0:
-            print(f"Warning: Invalid frame rate ({frame_rate}) detected for clipped audio.")
-            frame_rate = audio.frame_rate
-
-        if samples.size == 0:
-            print(f"Warning: Clipped audio resulted in empty samples array ({start_second}s to {end_second}s).")
-            return None
-
-        return (frame_rate, samples)
-    except FileNotFoundError:
-        print(f"Error: Audio file not found at path: {audio_path}")
-        return None
-    except Exception as e:
-        print(f"Error clipping audio {audio_path} from {start_second}s to {end_second}s: {e}")
-        return None
-
-def preprocess_audio(audio_path, session_dir):
-    """
-    Preprocess the audio file (resampling and mono conversion).
-
-    Args:
-        audio_path (str): Path to the input audio file.
-        session_dir (str): Path to the session directory.
-
-    Returns:
-        tuple: (processed_path, info_path_name, duration_sec), or None on failure.
-    """
-    try:
-        original_path_name = Path(audio_path).name
-        audio_name = Path(audio_path).stem
-
-        try:
-            gr.Info(f"Loading audio: {original_path_name}", duration=2)
-            audio = AudioSegment.from_file(audio_path)
-            duration_sec = audio.duration_seconds
-        except Exception as load_e:
-            gr.Error(f"Failed to load audio file {original_path_name}: {load_e}", duration=None)
-            return None, None, None
-
-        resampled = False
-        mono = False
-        target_sr = 16000
-
-        if audio.frame_rate != target_sr:
-            try:
-                audio = audio.set_frame_rate(target_sr)
-                resampled = True
-            except Exception as resample_e:
-                gr.Error(f"Failed to resample audio: {resample_e}", duration=None)
-                return None, None, None
-
-        if audio.channels == 2:
-            try:
-                audio = audio.set_channels(1)
-                mono = True
-            except Exception as mono_e:
-                gr.Error(f"Failed to convert audio to mono: {mono_e}", duration=None)
-                return None, None, None
-        elif audio.channels > 2:
-            gr.Error(f"Audio has {audio.channels} channels. Only mono (1) or stereo (2) supported.", duration=None)
-            return None, None, None
-
-        processed_audio_path = None
-        if resampled or mono:
-            try:
-                processed_audio_path = Path(session_dir, f"{audio_name}_resampled.wav")
-                audio.export(processed_audio_path, format="wav")
-                transcribe_path = processed_audio_path.as_posix()
-                info_path_name = f"{original_path_name} (processed)"
-            except Exception as export_e:
-                gr.Error(f"Failed to export processed audio: {export_e}", duration=None)
-                if processed_audio_path and os.path.exists(processed_audio_path):
-                    os.remove(processed_audio_path)
-                return None, None, None
-        else:
-            transcribe_path = audio_path
-            info_path_name = original_path_name
-
-        return transcribe_path, info_path_name, duration_sec
-    except Exception as e:
-        gr.Error(f"Audio preprocessing failed: {e}", duration=None)
-        return None, None, None
-
-def transcribe_audio(transcribe_path, model, duration_sec, device):
-    """
-    Transcribe an audio file and collect timestamps.
-
-    Args:
-        transcribe_path (str): Path to the input audio file.
-        model (ASRModel): ASR model to use.
-        duration_sec (float): Length of the audio file in seconds.
-        device (str): Device to use ('cuda' or 'cpu').
-
-    Returns:
-        tuple: (vis_data, raw_times_data, word_vis_data), or None on failure.
-    """
-    long_audio_settings_applied = False
-    try:
-        # Clear memory before using CUDA
-        if device == 'cuda':
-            torch.cuda.empty_cache()
-            gc.collect()
-
-        model.to(device)
-        model.to(torch.float32)
-        gr.Info(f"Transcribing on {device}...", duration=2)
-
-        if duration_sec > 480:
-            try:
-                gr.Info("Audio longer than 8 minutes. Applying optimized settings for long transcription.", duration=3)
-                print("Applying long audio settings: Local Attention and Chunking.")
-                model.change_attention_model("rel_pos_local_attn", [256, 256])
-                model.change_subsampling_conv_chunking_factor(1)
-                # Settings to improve memory efficiency
-                torch.cuda.empty_cache()
-                gc.collect()
-                long_audio_settings_applied = True
-            except Exception as setting_e:
-                gr.Warning(f"Could not apply long audio settings: {setting_e}", duration=5)
-                print(f"Warning: Failed to apply long audio settings: {setting_e}")
-
-        # Use bfloat16 for more memory-efficient inference
-        model.to(torch.bfloat16)
-
-        # Log memory usage
-        if device == 'cuda':
-            print(f"CUDA Memory before transcription: {torch.cuda.memory_allocated() / 1024**2:.2f} MB")
-
-        output = model.transcribe([transcribe_path], timestamps=True)
-
-        if not output or not isinstance(output, list) or not output[0] or not hasattr(output[0], 'timestamp') or not output[0].timestamp or 'segment' not in output[0].timestamp:
-            gr.Error("Transcription failed or produced unexpected output format.", duration=None)
-            return None, None, None
-
-        # Free GPU memory before processing the results
-        if device == 'cuda':
-            model.cpu()
-            torch.cuda.empty_cache()
-            gc.collect()
-
-        segment_timestamps = output[0].timestamp['segment']
-        vis_data = [[f"{ts['start']:.2f}", f"{ts['end']:.2f}", ts['segment']] for ts in segment_timestamps]
-        raw_times_data = [[ts['start'], ts['end']] for ts in segment_timestamps]
-
-        word_timestamps_raw = output[0].timestamp.get("word", [])
-        word_vis_data = [
-            [f"{w['start']:.2f}", f"{w['end']:.2f}", w["word"]]
-            for w in word_timestamps_raw if isinstance(w, dict) and 'start' in w and 'end' in w and 'word' in w
-        ]
-
-        gr.Info("Transcription complete.", duration=2)
-        return vis_data, raw_times_data, word_vis_data
-
-    except torch.cuda.OutOfMemoryError as e:
-        error_msg = 'CUDA out of memory. Please try a shorter audio or reduce GPU load.'
-        print(f"CUDA OutOfMemoryError: {e}")
-        gr.Error(error_msg, duration=None)
-        # Force cleanup on out-of-memory
-        if device == 'cuda':
-            torch.cuda.empty_cache()
-            gc.collect()
-        return None, None, None
-
-    except Exception as e:
-        error_msg = f"Transcription failed: {e}"
-        print(f"Error during transcription processing: {e}")
-        gr.Error(error_msg, duration=None)
-        return None, None, None
-
-    finally:
-        try:
-            if long_audio_settings_applied:
-                try:
-                    print("Reverting long audio settings.")
-                    model.change_attention_model("rel_pos")
-                    model.change_subsampling_conv_chunking_factor(-1)
-                except Exception as revert_e:
-                    print(f"Warning: Failed to revert long audio settings: {revert_e}")
-                    gr.Warning(f"Issue reverting model settings after long transcription: {revert_e}", duration=5)
-
-            if device == 'cuda':
-                model.cpu()
-                torch.cuda.empty_cache()
-                gc.collect()
-        except Exception as cleanup_e:
-            print(f"Error during model cleanup: {cleanup_e}")
-            gr.Warning(f"Issue during model cleanup: {cleanup_e}", duration=5)
-
-def save_transcripts(session_dir, audio_name, vis_data, word_vis_data):
-    """
-    Save the transcription results in multiple formats (CSV, SRT, VTT, JSON, LRC).
-
-    Args:
-        session_dir (str): Path to the session directory.
-        audio_name (str): Name of the audio file.
-        vis_data (list): Transcription results for display.
-        word_vis_data (list): Word-level timestamps.
-
-    Returns:
-        tuple: Update objects for each file's download button.
-    """
-    try:
-        csv_headers = ["Start (s)", "End (s)", "Segment"]
-        csv_file_path = Path(session_dir, f"transcription_{audio_name}.csv")
-        with open(csv_file_path, 'w', newline='', encoding='utf-8') as f:
-            writer = csv.writer(f)
-            writer.writerow(csv_headers)
-            writer.writerows(vis_data)
-        print(f"CSV transcript saved to temporary file: {csv_file_path}")
-
-        srt_file_path = Path(session_dir, f"transcription_{audio_name}.srt")
-        vtt_file_path = Path(session_dir, f"transcription_{audio_name}.vtt")
-        json_file_path = Path(session_dir, f"transcription_{audio_name}.json")
-        write_srt(vis_data, srt_file_path)
-        write_vtt(vis_data, word_vis_data, vtt_file_path)
-        write_json(vis_data, word_vis_data, json_file_path)
-        print(f"SRT, VTT, JSON transcript saved to temporary files: {srt_file_path}, {vtt_file_path}, {json_file_path}")
-
-        lrc_file_path = Path(session_dir, f"transcription_{audio_name}.lrc")
-        write_lrc(vis_data, lrc_file_path)
-        print(f"LRC transcript saved to temporary file: {lrc_file_path}")
-
-        return (
-            gr.DownloadButton(value=csv_file_path.as_posix(), visible=True),
-            gr.DownloadButton(value=srt_file_path.as_posix(), visible=True),
-            gr.DownloadButton(value=vtt_file_path.as_posix(), visible=True),
-            gr.DownloadButton(value=json_file_path.as_posix(), visible=True),
-            gr.DownloadButton(value=lrc_file_path.as_posix(), visible=True)
-        )
-    except Exception as e:
-        gr.Error(f"Failed to create transcript files: {e}", duration=None)
-        print(f"Error writing transcript files: {e}")
-        return tuple([gr.DownloadButton(visible=False)] * 5)
-
-def split_audio_with_overlap(audio_path: str, session_dir: str, chunk_length_sec: int = 3600, overlap_sec: int = 30) -> List[str]:
-    """
-    Split the audio file into chunk_length_sec chunks with overlap_sec of overlap,
-    and return the list of chunk file paths.
-    """
-    audio = AudioSegment.from_file(audio_path)
-    duration = audio.duration_seconds
-    chunk_paths = []
-    start = 0
-    chunk_idx = 0
-    while start < duration:
-        end = min(start + chunk_length_sec, duration)
-        # Account for the overlap
-        chunk_start = max(0, start - (overlap_sec if start > 0 else 0))
-        chunk_end = min(end + (overlap_sec if end < duration else 0), duration)
-        chunk = audio[chunk_start * 1000:chunk_end * 1000]
-        chunk_path = Path(session_dir, f"chunk_{chunk_idx:03d}.wav").as_posix()
-        chunk.export(chunk_path, format="wav")
-        chunk_paths.append(chunk_path)
-        start += chunk_length_sec
-        chunk_idx += 1
-    return chunk_paths
-
-@spaces.GPU
-def get_transcripts_and_raw_times(audio_path, session_dir, progress=gr.Progress(track_tqdm=True)):
-    """
-    Process the audio file and generate transcription results.
-    Audio longer than 3 hours is split into 60-minute chunks, transcribed with overlap, and merged.
-    """
-    if not audio_path:
-        gr.Error("No audio file path provided for transcription.", duration=None)
-        return [], [], [], None, gr.DownloadButton(visible=False), gr.DownloadButton(visible=False), gr.DownloadButton(visible=False), gr.DownloadButton(visible=False), gr.DownloadButton(visible=False)
-
-    audio_name = Path(audio_path).stem
-    processed_audio_path = None
-    temp_chunk_paths = []
-
-    try:
-        # Preprocess the audio
-        transcribe_path, info_path_name, duration_sec = preprocess_audio(audio_path, session_dir)
-        if not transcribe_path or not duration_sec:
-            return [], [], [], audio_path, gr.DownloadButton(visible=False), gr.DownloadButton(visible=False), gr.DownloadButton(visible=False), gr.DownloadButton(visible=False), gr.DownloadButton(visible=False)
-
-        processed_audio_path = transcribe_path if transcribe_path != audio_path else None
-
-        # Audio longer than 3 hours is split and transcribed sequentially
-        if duration_sec > 10800:
-            gr.Info("Audio is longer than 3 hours. Splitting into 1-hour chunks with overlap for transcription.", duration=5)
-            chunk_paths = split_audio_with_overlap(transcribe_path, session_dir, chunk_length_sec=3600, overlap_sec=30)
-            temp_chunk_paths = chunk_paths.copy()
-            all_vis_data = []
-            all_raw_times_data = []
-            all_word_vis_data = []
-            offset = 0.0
-            prev_end = 0.0
-            for i, chunk_path in enumerate(progress.tqdm(chunk_paths, desc="Processing audio chunks")):
-                chunk_audio = AudioSegment.from_file(chunk_path)
-                chunk_duration = chunk_audio.duration_seconds
-                # Run ASR
-                result = transcribe_audio(chunk_path, model, chunk_duration, device)
-                if not result:
-                    continue
-                vis_data, raw_times_data, word_vis_data = result
-                # Shift timestamps by the global offset
-                vis_data_offset = []
-                raw_times_data_offset = []
-                word_vis_data_offset = []
-                for row in vis_data:
-                    s, e, seg = float(row[0]), float(row[1]), row[2]
-                    vis_data_offset.append([f"{s+offset:.2f}", f"{e+offset:.2f}", seg])
-                for row in raw_times_data:
-                    s, e = float(row[0]), float(row[1])
-                    raw_times_data_offset.append([s+offset, e+offset])
-                for row in word_vis_data:
-                    s, e, w = float(row[0]), float(row[1]), row[2]
-                    word_vis_data_offset.append([f"{s+offset:.2f}", f"{e+offset:.2f}", w])
-                # Deduplicate the overlap (simply keep rows starting at or after the previous end)
-                vis_data_offset = [row for row in vis_data_offset if float(row[0]) >= prev_end]
-                raw_times_data_offset = [row for row in raw_times_data_offset if row[0] >= prev_end]
-                word_vis_data_offset = [row for row in word_vis_data_offset if float(row[0]) >= prev_end]
-                if vis_data_offset:
-                    prev_end = float(vis_data_offset[-1][1])
-                all_vis_data.extend(vis_data_offset)
-                all_raw_times_data.extend(raw_times_data_offset)
-                all_word_vis_data.extend(word_vis_data_offset)
-                offset += chunk_duration - (30 if i < len(chunk_paths)-1 else 0)
-            # Save the transcript files
-            button_updates = save_transcripts(session_dir, audio_name, all_vis_data, all_word_vis_data)
-            # Remove temporary chunk files
-            for p in temp_chunk_paths:
-                try:
-                    os.remove(p)
-                except Exception:
-                    pass
-            return (
-                all_vis_data,
-                all_raw_times_data,
-                all_word_vis_data,
-                audio_path,
-                *button_updates
-            )
-        else:
-            # Audio up to 3 hours is handled as before
-            result = transcribe_audio(transcribe_path, model, duration_sec, device)
-            if not result:
-                return [], [], [], audio_path, gr.DownloadButton(visible=False), gr.DownloadButton(visible=False), gr.DownloadButton(visible=False), gr.DownloadButton(visible=False), gr.DownloadButton(visible=False)
-            vis_data, raw_times_data, word_vis_data = result
-            button_updates = save_transcripts(session_dir, audio_name, vis_data, word_vis_data)
-            return (
-                vis_data,
-                raw_times_data,
-                word_vis_data,
-                audio_path,
-                *button_updates
-            )
-    finally:
-        if processed_audio_path and os.path.exists(processed_audio_path):
-            try:
-                os.remove(processed_audio_path)
-                print(f"Temporary audio file {processed_audio_path} removed.")
-            except Exception as e:
-                print(f"Error removing temporary audio file {processed_audio_path}: {e}")
-        # Clean up chunk files
-        for p in temp_chunk_paths:
-            if os.path.exists(p):
-                try:
-                    os.remove(p)
-                except Exception:
-                    pass
-
-def play_segment(evt: gr.SelectData, raw_ts_list, current_audio_path):
-    if not isinstance(raw_ts_list, list):
-        print(f"Warning: raw_ts_list is not a list ({type(raw_ts_list)}). Cannot play segment.")
-        return gr.Audio(value=None, label="Selected Segment")
-
-    if not current_audio_path:
-        print("No audio path available to play segment from.")
-        return gr.Audio(value=None, label="Selected Segment")
-
-    selected_index = evt.index[0]
-
-    if selected_index < 0 or selected_index >= len(raw_ts_list):
-        print(f"Invalid index {selected_index} selected for list of length {len(raw_ts_list)}.")
-        return gr.Audio(value=None, label="Selected Segment")
-
-    if not isinstance(raw_ts_list[selected_index], (list, tuple)) or len(raw_ts_list[selected_index]) != 2:
-        print(f"Warning: Data at index {selected_index} is not in the expected format [start, end].")
-        return gr.Audio(value=None, label="Selected Segment")
-
-    start_time_s, end_time_s = raw_ts_list[selected_index]
-    print(f"Attempting to play segment: {current_audio_path} from {start_time_s:.2f}s to {end_time_s:.2f}s")
-    segment_data = get_audio_segment(current_audio_path, start_time_s, end_time_s)
-
-    if segment_data:
-        print("Segment data retrieved successfully.")
-        return gr.Audio(value=segment_data, autoplay=True, label=f"Segment: {start_time_s:.2f}s - {end_time_s:.2f}s", interactive=False)
-    else:
-        print("Failed to get audio segment data.")
-        return gr.Audio(value=None, label="Selected Segment")
-
-def write_srt(segments, path):
-    def sec2srt(t):
-        h, rem = divmod(int(float(t)), 3600)
-        m, s = divmod(rem, 60)
-        ms = int((float(t) - int(float(t))) * 1000)
-        return f"{h:02}:{m:02}:{s:02},{ms:03}"
-    with open(path, "w", encoding="utf-8") as f:
-        for i, seg in enumerate(segments, 1):
-            f.write(f"{i}\n{sec2srt(seg[0])} --> {sec2srt(seg[1])}\n{seg[2]}\n\n")
-
-def write_vtt(segments, words, path):
-    def sec2vtt(t):
-        h, rem = divmod(int(float(t)), 3600)
-        m, s = divmod(rem, 60)
-        ms = int((float(t) - int(float(t))) * 1000)
-        return f"{h:02}:{m:02}:{s:02}.{ms:03}"
-
-    with open(path, "w", encoding="utf-8") as f:
-        f.write("WEBVTT\n\n")
-
-        word_idx = 0
-        for seg_idx, seg in enumerate(segments):  # enumerate in case the segment index is ever needed
-            s_start = float(seg[0])
-            s_end = float(seg[1])
-            # s_text = seg[2]  # s_text is not used directly in the VTT output here
-
-            segment_words = []
-            temp_word_idx = word_idx  # start scanning from the current word_idx
-            while temp_word_idx < len(words):
-                w = words[temp_word_idx]
-                w_start_val = float(w[0])
-                w_end_val = float(w[1])
-                # Include words that fall inside the current segment; following the
-                # original logic, only words that both start and end inside it are taken
-                if w_start_val >= s_start and w_end_val <= s_end:
-                    segment_words.append(w)
-                    if temp_word_idx == word_idx:  # advance word_idx for the first word added to segment_words
-                        word_idx = temp_word_idx + 1
-                    temp_word_idx += 1
-                elif w_start_val < s_start and w_end_val > s_start:  # word straddles the segment start
-                    # if needed, such words could also be added to segment_words here
-                    temp_word_idx += 1
-                elif w_start_val > s_end:  # word starts after the segment ends; done with this segment
-                    break
-                else:  # all other cases (e.g. word lies entirely before the segment)
-                    if temp_word_idx == word_idx:  # avoid word_idx getting stuck
-                        word_idx = temp_word_idx + 1
-                    temp_word_idx += 1
-
-            # Emit one timestamped cue per word
-            for i, word_data in enumerate(segment_words):
-                w_start = float(word_data[0])
-                w_end = float(word_data[1])
-
-                # Highlight the current word; render the others normally
-                colored_text = ""
-                for j, other_word_data in enumerate(segment_words):
-                    if j == i:  # highlight the current (i-th) word
-                        colored_text += f"<c.yellow><b>{other_word_data[2]}</b></c> "
-                    else:
-                        colored_text += f"{other_word_data[2]} "
-
-                f.write(f"{sec2vtt(w_start)} --> {sec2vtt(w_end)}\n{colored_text.strip()}\n\n")
-
-def write_json(segments, words, path):
-    result = {"segments": []}
-    word_idx = 0
-    for s in segments:
-        s_start = float(s[0])
-        s_end = float(s[1])
-        s_text = s[2]
-        word_list = []
-        while word_idx < len(words):
-            w = words[word_idx]
-            w_start = float(w[0])
-            w_end = float(w[1])
-            if w_start >= s_start and w_end <= s_end:
-                word_list.append({"start": w_start, "end": w_end, "word": w[2]})
-                word_idx += 1
-            elif w_end < s_start:
-                word_idx += 1
-            else:
-                break
-        result["segments"].append({
-            "start": s_start,
-            "end": s_end,
-            "text": s_text,
-            "words": word_list
-        })
-    with open(path, "w", encoding="utf-8") as f:
-        json.dump(result, f, ensure_ascii=False, indent=2)
-
-def write_lrc(segments, path):
-    def sec2lrc(t):
-        m, s = divmod(float(t), 60)
-        return f"[{int(m):02}:{s:05.2f}]"
-    with open(path, "w", encoding="utf-8") as f:
-        for seg in segments:
-            f.write(f"{sec2lrc(seg[0])}{seg[2]}\n")
-
-article = (
-    "<p style='font-size: 1.1em;'>"
-    "This demo showcases accurate English speech-to-text with "
-    "<code><a href='https://huggingface.co/nvidia/parakeet-tdt-0.6b-v2' target='_blank'>parakeet-tdt-0.6b-v2</a></code> "
-    "(about 600M parameters)."
-    "</p>"
-    "<p><strong style='color: red; font-size: 1.2em;'>Key features:</strong></p>"
-    "<ul style='font-size: 1.1em;'>"
-    "  <li>Automatic punctuation and capitalization</li>"
-    "  <li>Word-level timestamps (click a row in the table below to play that span)</li>"
-    "  <li>Character-level timestamp display is also supported</li>"
-    "  <li>Efficient transcription of <strong>long audio</strong> via automatic chunking (handles recordings of several hours)</li>"
-    "  <li>Robust to numbers, song lyrics, and other varied speech</li>"
-    "</ul>"
-    "<p style='font-size: 1.1em;'>"
-    "Available for commercial and non-commercial use <strong>without license restrictions</strong>."
-    "</p>"
-    "<p style='text-align: center;'>"
-    "<a href='https://huggingface.co/nvidia/parakeet-tdt-0.6b-v2' target='_blank'>🎙️ Model details</a> | "
-    "<a href='https://arxiv.org/abs/2305.05084' target='_blank'>📄 Fast&nbsp;Conformer paper</a> | "
-    "<a href='https://arxiv.org/abs/2304.06795' target='_blank'>📚 TDT paper</a> | "
-    "<a href='https://github.com/NVIDIA/NeMo' target='_blank'>🧑‍💻 NeMo repository</a>"
-    "</p>"
-)
-
-examples = [
-    ["data/example-yt_saTD1u8PorI.mp3"],
-]
-
-nvidia_theme = gr_themes.Default(
-    primary_hue=gr_themes.Color(
-        c50="#E6F1D9", c100="#CEE3B3", c200="#B5D58C", c300="#9CC766",
-        c400="#84B940", c500="#76B900", c600="#68A600", c700="#5A9200",
-        c800="#4C7E00", c900="#3E6A00", c950="#2F5600"
-    ),
-    neutral_hue="gray",
-    font=[gr_themes.GoogleFont("Inter"), "ui-sans-serif", "system-ui", "sans-serif"],
-).set()
-
-with gr.Blocks(theme=nvidia_theme) as demo:
-    model_display_name = MODEL_NAME.split('/')[-1] if '/' in MODEL_NAME else MODEL_NAME
-    gr.Markdown(f"<h1 style='text-align: center; margin: 0 auto;'>Long-Audio Speech-to-Text ({model_display_name})</h1>")
-    gr.HTML(article)
-
-    current_audio_path_state = gr.State(None)
-    raw_timestamps_list_state = gr.State([])
-    session_dir_state = gr.State()
-    demo.load(start_session, outputs=[session_dir_state])
-
-    with gr.Tabs():
-        with gr.TabItem("Audio File"):
-            file_input = gr.Audio(sources=["upload"], type="filepath", label="Upload Audio File")
-            gr.Examples(examples=examples, inputs=[file_input], label="Example Audio Files (Click to Load)")
-            file_transcribe_btn = gr.Button("Transcribe Uploaded File", variant="primary")
-
-        with gr.TabItem("Microphone"):
-            mic_input = gr.Audio(sources=["microphone"], type="filepath", label="Record Audio")
-            mic_transcribe_btn = gr.Button("Transcribe Microphone Input", variant="primary")
-
-    gr.Markdown("---")
-    gr.Markdown("<p><strong style='color: #FF0000; font-size: 1.2em;'>Transcription Results</strong></p>")
-
-    download_btn = gr.DownloadButton(label="Download Segment Transcript (CSV)", visible=False)
-    srt_btn = gr.DownloadButton(label="Download SRT", visible=False)
-    vtt_btn = gr.DownloadButton(label="Download VTT", visible=False)
-    json_btn = gr.DownloadButton(label="Download JSON", visible=False)
-    lrc_btn = gr.DownloadButton(label="Download LRC", visible=False)
-
-    with gr.Tabs():
-        with gr.TabItem("Segment View (Click row to play segment)"):
-            vis_timestamps_df = gr.DataFrame(
-                headers=["Start (s)", "End (s)", "Segment"],
-                datatype=["number", "number", "str"],
-                wrap=True,
-            )
-            selected_segment_player = gr.Audio(label="Selected Segment", interactive=False)
-
-        with gr.TabItem("Word View"):
-            word_vis_df = gr.DataFrame(
-                headers=["Start (s)", "End (s)", "Word"],
-                datatype=["number", "number", "str"],
-                wrap=False,
-            )
-
-    mic_transcribe_btn.click(
-        fn=get_transcripts_and_raw_times,
-        inputs=[mic_input, session_dir_state],
-        outputs=[vis_timestamps_df, raw_timestamps_list_state, word_vis_df, current_audio_path_state, download_btn, srt_btn, vtt_btn, json_btn, lrc_btn],
-        api_name="transcribe_mic"
-    )
-
-    file_transcribe_btn.click(
-        fn=get_transcripts_and_raw_times,
-        inputs=[file_input, session_dir_state],
-        outputs=[vis_timestamps_df, raw_timestamps_list_state, word_vis_df, current_audio_path_state, download_btn, srt_btn, vtt_btn, json_btn, lrc_btn],
-        api_name="transcribe_file"
-    )
-
-    vis_timestamps_df.select(
-        fn=play_segment,
-        inputs=[raw_timestamps_list_state, current_audio_path_state],
-        outputs=[selected_segment_player],
-    )
-
-    demo.unload(end_session)
-
-if __name__ == "__main__":
-    print("Launching Gradio Demo...")
-    demo.queue(
-        max_size=5,
-        default_concurrency_limit=1  # default concurrency of 1 per event listener
-    )
-    demo.launch(
-        server_name="127.0.0.1",
-        server_port=7860,
-        share=False,
-        max_threads=1  # single worker thread for the whole server
-    )
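
Note: the deleted app_wsl copy.py was a stale duplicate of app_wsl.py. Its most involved piece is the long-audio path above: recordings over 3 hours are cut into 1-hour chunks with 30 s of overlap, each chunk is transcribed on its own, timestamps are shifted by a running offset, and rows starting before the previous chunk's last emitted end are dropped to deduplicate the overlap. Below is a minimal sketch of the splitting step only, assuming pydub (backed by ffmpeg) is installed; the function name and signature are illustrative, not repo code.

from pydub import AudioSegment

def split_with_overlap(path, chunk_sec=3600, overlap_sec=30):
    """Yield (global_offset_sec, AudioSegment) chunks covering the file."""
    audio = AudioSegment.from_file(path)
    duration = audio.duration_seconds
    start = 0.0
    while start < duration:
        end = min(start + chunk_sec, duration)
        # Pad interior boundaries by overlap_sec so words cut at a boundary
        # appear whole in at least one chunk; the caller dedupes later.
        pad_start = max(0.0, start - (overlap_sec if start > 0 else 0))
        pad_end = min(end + (overlap_sec if end < duration else 0), duration)
        yield pad_start, audio[int(pad_start * 1000):int(pad_end * 1000)]
        start += chunk_sec

Yielding the true global offset per chunk (rather than accumulating one, as the deleted file did) reduces the later timestamp correction to a single addition.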
 
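Also worth noting in the deleted file: write_vtt emits one cue per word rather than one per segment, re-rendering the whole segment line each time with the active word wrapped in <c.yellow><b>…</b></c>, which VTT-class-aware players display as karaoke-style highlighting. A compact sketch of that cue shape, assuming word rows of (start_sec, end_sec, text) as in the deleted code:

def sec2vtt(t: float) -> str:
    # hh:mm:ss.mmm, the WEBVTT timestamp format
    h, rem = divmod(int(t), 3600)
    m, s = divmod(rem, 60)
    return f"{h:02}:{m:02}:{s:02}.{int((t - int(t)) * 1000):03}"

def karaoke_cues(words):
    # One cue per word; the active word is highlighted within the full line.
    for i, (start, end, _) in enumerate(words):
        line = " ".join(
            f"<c.yellow><b>{w}</b></c>" if j == i else w
            for j, (_, _, w) in enumerate(words)
        )
        yield f"{sec2vtt(start)} --> {sec2vtt(end)}\n{line}\n"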
app_wsl.py CHANGED
@@ -38,33 +38,6 @@ def end_session(request: gr.Request):
         # shutil.rmtree(session_dir)
     print(f"Session with hash {session_hash} ended.")
 
-def get_server_files(server_dir: str = None) -> List[str]:
-    """
-    List the audio files in a server-side directory.
-
-    Args:
-        server_dir (str, optional): Directory to scan. Uses the default location if None.
-
-    Returns:
-        List[str]: List of audio file paths.
-    """
-    if server_dir is None:
-        server_dir = str(Path(__file__).parent / "data")
-
-    audio_extensions = {".mp3", ".wav", ".m4a", ".ogg", ".flac"}
-    audio_files = []
-
-    try:
-        for root, _, files in os.walk(server_dir):
-            for file in files:
-                if Path(file).suffix.lower() in audio_extensions:
-                    full_path = str(Path(root) / file)
-                    audio_files.append(full_path)
-        return sorted(audio_files)
-    except Exception as e:
-        print(f"Error scanning directory {server_dir}: {e}")
-        return []
-
 def get_audio_segment(audio_path, start_second, end_second):
     if not audio_path or not Path(audio_path).exists():
         print(f"Warning: Audio path '{audio_path}' not found or invalid for clipping.")
@@ -623,22 +596,14 @@ with gr.Blocks(theme=nvidia_theme) as demo:
     current_audio_path_state = gr.State(None)
     raw_timestamps_list_state = gr.State([])
     session_dir_state = gr.State()
-    demo.load(start_session, outputs=[session_dir_state])    with gr.Tabs():
-        with gr.TabItem("Upload Audio"):
+    demo.load(start_session, outputs=[session_dir_state])
+
+    with gr.Tabs():
+        with gr.TabItem("Audio File"):
             file_input = gr.Audio(sources=["upload"], type="filepath", label="Upload Audio File")
             gr.Examples(examples=examples, inputs=[file_input], label="Example Audio Files (Click to Load)")
             file_transcribe_btn = gr.Button("Transcribe Uploaded File", variant="primary")
 
-        with gr.TabItem("Server Files"):
-            server_files = get_server_files()
-            server_file_dropdown = gr.Dropdown(
-                choices=server_files,
-                value=server_files[0] if server_files else None,
-                label="Select Audio File from Server",
-                type="value"
-            )
-            server_file_transcribe_btn = gr.Button("Transcribe Selected File", variant="primary")
-
         with gr.TabItem("Microphone"):
             mic_input = gr.Audio(sources=["microphone"], type="filepath", label="Record Audio")
             mic_transcribe_btn = gr.Button("Transcribe Microphone Input", variant="primary")
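
Besides dropping the Server Files tab and get_server_files(), the second hunk fixes a genuine bug: in the old file, demo.load(...) and with gr.Tabs(): had been fused onto one line, a Python syntax error. The restored pattern ties per-session state to client connect and disconnect. A minimal sketch, assuming a recent Gradio 4.x API where Blocks.load and Blocks.unload accept handlers taking gr.Request:

import gradio as gr

def start_session(request: gr.Request):
    # Create and return a per-session working directory keyed by session_hash.
    return f"outputs/{request.session_hash}"

def end_session(request: gr.Request):
    print(f"Session {request.session_hash} ended.")

with gr.Blocks() as demo:
    session_dir_state = gr.State()
    demo.load(start_session, outputs=[session_dir_state])  # fires on client connect
    demo.unload(end_session)                               # fires on disconnect

demo.launch()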
run_parakeet.bat CHANGED
@@ -1,37 +1,51 @@
 @echo off
-REM Set codepage to UTF-8 (prevents garbled characters)
+REM Set codepage to UTF-8
 chcp 65001
 
 REM ============================================================================
-REM run_parakeet.bat - Parakeet ASR Launcher
-REM - Sets up WSL environment
-REM - Activates conda environment
-REM - Launches the Parakeet ASR application
+REM Parakeet ASR, directory-processing version
+REM - Shows a directory picker dialog
+REM - Converts the path for WSL and runs the Python script
 REM ============================================================================
 
-echo [Info] Starting Parakeet speech-to-text...
+echo [Info] Starting Parakeet speech recognition...
 echo.
 
-REM Move to script directory
-pushd "%~dp0"
+REM Show the directory picker dialog
+set "WSL_DIR="
+for /f "usebackq delims=" %%d in (`powershell -STA -Command ^
+    "Add-Type -AssemblyName System.Windows.Forms; ^
+     $dialog = New-Object System.Windows.Forms.FolderBrowserDialog; ^
+     $dialog.Description = 'Select the directory to process'; ^
+     if($dialog.ShowDialog() -eq 'OK'){Write-Output $dialog.SelectedPath}"`) do (
+    set "WIN_DIR=%%d"
+)
 
-REM Kill any existing instances
-echo [Info] Cleaning up existing processes...
-wsl.exe bash -ic "pkill -f 'python.*app_wsl.py'" 2>nul
-timeout /t 2 /nobreak > nul
+REM Handle the case where no directory was selected
+if not defined WIN_DIR (
+    echo [Error] No directory was selected
+    pause
+    exit /b 1
+)
 
-REM Prepare WSL environment and run the application
-echo [Info] Setting up WSL environment...
-wsl.exe bash -ic "cd \"$(wslpath -a '%cd%')\" && source ~/miniconda3/etc/profile.d/conda.sh && conda activate parakeet-env && python app_wsl.py"
-REM ***** end of command *****
+echo [Info] Selected directory: %WIN_DIR%
 
+REM Run inside the WSL environment
+pushd "%~dp0"
+wsl.exe bash -ic "\
+    export WIN_DIR='%WIN_DIR:\=\\%'; \
+    target_dir=\$(wslpath -a \"\$WIN_DIR\"); \
+    cd \"$(wslpath -a '%cd%')\" && \
+    source ~/miniconda3/etc/profile.d/conda.sh && \
+    conda activate parakeet-env && \
+    python transcribe_cli.py \"\$target_dir\""
 
+REM Error check
 if errorlevel 1 (
-    echo.
-    echo [Error] Failed to start the application.
-    echo [Error] Check that WSL and the conda environment are set up correctly.
-    echo.
-    echo Press any key to exit...
-    pause > nul
+    echo [Error] An error occurred during processing
+    pause
+    exit /b 1
 ) else (
     popd
+    exit /b 0
 )
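
The new launcher's key move is the Windows-to-WSL path handoff: the PowerShell FolderBrowserDialog yields a Windows path (C:\Users\...), backslashes are doubled via %WIN_DIR:\=\\% so they survive the bash -ic quoting, and wslpath -a converts the result to /mnt/c/Users/... before it reaches transcribe_cli.py. That script is not part of this commit, so the sketch below is only a hypothetical shape for its entry point; every name in it is an assumption.

import sys
from pathlib import Path

AUDIO_EXTS = {".mp3", ".wav", ".m4a", ".ogg", ".flac"}

def main(target_dir: str) -> None:
    root = Path(target_dir)
    if not root.is_dir():
        sys.exit(f"Not a directory: {target_dir}")
    # The batch file passes a WSL path such as /mnt/c/... produced by wslpath -a.
    for audio in sorted(p for p in root.rglob("*") if p.suffix.lower() in AUDIO_EXTS):
        print(f"Would transcribe: {audio}")  # placeholder for the actual ASR call

if __name__ == "__main__":
    main(sys.argv[1] if len(sys.argv) > 1 else ".")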