2025-12-01

2026-03-17 14:58:51 -06:00
parent 183e865f8b
commit 4b82b57113
6846 changed files with 954887 additions and 162606 deletions
@@ -0,0 +1,78 @@
+#!/usr/bin/env python3
+
+import logging
+import argparse
+
+from pathlib import Path
+from vosk import list_models, list_languages
+from vosk.transcriber.transcriber import Transcriber
+
+# Command-line interface of the transcriber. The parser is built at import time;
+# main() only calls parse_args() on it.
+parser = argparse.ArgumentParser(
+        description='Transcribe audio file and save result in selected format')
+parser.add_argument(
+        '--model', '-m', type=str,
+        help='model path')
+# 'store_true' flags already default to False, so no explicit default is needed.
+parser.add_argument(
+        '--list-models', action='store_true',
+        help='list available models')
+parser.add_argument(
+        '--list-languages', action='store_true',
+        help='list available languages')
+parser.add_argument(
+        '--model-name', '-n', type=str,
+        help='select model by name')
+parser.add_argument(
+        '--lang', '-l', default='en-us', type=str,
+        help='select model by language')
+parser.add_argument(
+        '--input', '-i', type=str,
+        help='audiofile')
+parser.add_argument(
+        '--output', '-o', default='', type=str,
+        help='optional output filename path')
+# Only 'txt' and 'srt' are rendered by Transcriber.format_result; any other
+# value used to be accepted here and silently produced empty output, so it is
+# now rejected at parse time.
+parser.add_argument(
+        '--output-type', '-t', default='txt', type=str, choices=('txt', 'srt'),
+        help='optional arg output data type')
+parser.add_argument(
+        '--log-level', default='INFO',
+        help='logging level')
+
+def main():
+    """Run the command-line transcriber.
+
+    Parses the command line, then either lists the available models or
+    languages, or transcribes ``--input``:
+
+    * a directory input together with a directory output is handed to
+      ``Transcriber.process_directory``;
+    * a file input with no output, or with an existing output file, is handed
+      to ``Transcriber.process_file``.
+
+    Any invalid argument combination is reported at ERROR level and the
+    process exits with status 1.
+    """
+    # Local import: sys is only needed for the error exits below. sys.exit is
+    # used instead of the bare exit() helper, which is injected by the site
+    # module and is not guaranteed to exist.
+    import sys
+
+    args = parser.parse_args()
+    log_level = args.log_level.upper()
+    try:
+        logging.getLogger().setLevel(log_level)
+    except ValueError:
+        # setLevel() rejects unknown level names; report it instead of
+        # letting the traceback reach the user.
+        logging.error('Unknown log level %s', args.log_level)
+        sys.exit(1)
+
+    if args.list_models:
+        list_models()
+        return
+
+    if args.list_languages:
+        list_languages()
+        return
+
+    # Fatal problems are logged at ERROR so they stay visible when the user
+    # raises --log-level above INFO.
+    if not args.input:
+        logging.error('Please specify input file or directory')
+        sys.exit(1)
+
+    input_path = Path(args.input)
+    if not input_path.exists():
+        logging.error('File %s does not exist, please specify an existing file/directory', args.input)
+        sys.exit(1)
+
+    output_given = args.output != ''
+    if output_given and not Path(args.output).exists():
+        logging.error('Output %s does not exist, please specify an existing file', args.output)
+        sys.exit(1)
+
+    # Path('') is the current directory, so a directory input without
+    # --output is treated as "write into the current working directory".
+    dir_job = input_path.is_dir() and Path(args.output).is_dir()
+    file_job = input_path.is_file() and (not output_given or Path(args.output).is_file())
+
+    # Validate the combination before constructing the Transcriber, which
+    # loads the model.
+    if not (dir_job or file_job):
+        logging.error('Wrong arguments, input and output must be same type')
+        sys.exit(1)
+
+    transcriber = Transcriber(args)
+
+    if dir_job:
+        # The class defines process_directory; there is no process_dir method.
+        transcriber.process_directory(args)
+    else:
+        transcriber.process_file(args)
+
+# Run the CLI only when this file is executed as a script, not when imported.
+if __name__ == "__main__":
+    main()
@@ -0,0 +1,89 @@
+import json
+import subprocess
+import srt
+import datetime
+import os
+import logging
+
+from pathlib import Path
+from timeit import default_timer as timer
+from vosk import KaldiRecognizer, Model
+
+class Transcriber:
+    """Transcribe audio with a Vosk model and render the result as text or SRT.
+
+    Audio is decoded by an ``ffmpeg`` subprocess into 16 kHz mono signed
+    16-bit little-endian PCM, fed to a ``KaldiRecognizer`` chunk by chunk, and
+    the recognizer's JSON results are formatted according to
+    ``args.output_type``.
+    """
+
+    SAMPLE_RATE = 16000    # rate requested from ffmpeg and passed to the recognizer
+    BYTES_PER_SAMPLE = 2   # ffmpeg is asked for 's16le', i.e. 16-bit samples
+    CHUNK_SIZE = 4000      # bytes read from the ffmpeg pipe per recognizer call
+
+    def __init__(self, args):
+        """Load the model selected by ``args.model``, ``args.model_name`` and ``args.lang``.
+
+        ``args`` is kept on the instance; ``format_result`` reads
+        ``args.output_type`` from it.
+        """
+        self.model = Model(model_path=args.model, model_name=args.model_name, lang=args.lang)
+        self.args = args
+
+    def recognize_stream(self, rec, stream):
+        """Feed the bytes read from ``stream.stdout`` to ``rec`` until end of stream.
+
+        Returns a ``(result, tot_samples)`` tuple. ``result`` is the list of
+        decoded JSON objects: one per ``rec.Result()`` call plus the closing
+        ``rec.FinalResult()``. ``tot_samples`` is the number of 16-bit samples
+        that were read from the stream.
+        """
+        tot_bytes = 0
+        result = []
+        while True:
+            data = stream.stdout.read(self.CHUNK_SIZE)
+            if len(data) == 0:
+                break
+            # Count every chunk, not only the ones for which AcceptWaveform
+            # returned true; otherwise most of the audio is left out of the
+            # total and the speed figure derived from it is meaningless.
+            tot_bytes += len(data)
+            if rec.AcceptWaveform(data):
+                result.append(json.loads(rec.Result()))
+        result.append(json.loads(rec.FinalResult()))
+        # The stream carries 2-byte samples, so bytes are converted to samples.
+        return result, tot_bytes // self.BYTES_PER_SAMPLE
+
+    def format_result(self, result, words_per_line=7):
+        """Render recognizer results in the format named by ``self.args.output_type``.
+
+        * ``'srt'``: the word entries under each result's ``'result'`` key are
+          grouped ``words_per_line`` at a time into subtitles spanning from the
+          first word's ``'start'`` to the last word's ``'end'``.
+        * ``'txt'``: every result's ``'text'`` followed by a single space.
+        * any other value: the empty string.
+        """
+        output_type = self.args.output_type
+        if output_type == 'srt':
+            subs = []
+            for res in result:
+                # Results without word-level entries contribute no subtitles.
+                if 'result' not in res:
+                    continue
+                words = res['result']
+                for j in range(0, len(words), words_per_line):
+                    line = words[j : j + words_per_line]
+                    subs.append(srt.Subtitle(
+                        index=len(subs),
+                        content=' '.join(w['word'] for w in line),
+                        start=datetime.timedelta(seconds=line[0]['start']),
+                        end=datetime.timedelta(seconds=line[-1]['end'])))
+            return srt.compose(subs)
+        if output_type == 'txt':
+            return ''.join(part['text'] + ' ' for part in result)
+        return ''
+
+    def resample_ffmpeg(self, infile):
+        """Start ``ffmpeg`` decoding ``infile`` to raw PCM on its stdout.
+
+        Returns the ``subprocess.Popen`` object; the caller reads the audio
+        from its ``stdout`` and is responsible for closing and reaping it.
+        """
+        return subprocess.Popen(
+            ['ffmpeg', '-nostdin', '-loglevel', 'quiet', '-i',
+             str(infile),  # callers pass either str or Path
+             '-ar', str(self.SAMPLE_RATE), '-ac', '1', '-f', 's16le', '-'],
+            stdout=subprocess.PIPE)
+
+    def process_entry(self, inputdata):
+        """Transcribe one ``(input, output)`` pair.
+
+        ``inputdata[0]`` is the audio file and ``inputdata[1]`` the output
+        path; when the output is the empty string the transcript is printed
+        instead of written. Returns ``(final_result, tot_samples)``.
+        """
+        infile = inputdata[0]
+        outfile = inputdata[1]
+        logging.info(f'Recognizing {infile}')
+
+        rec = KaldiRecognizer(self.model, self.SAMPLE_RATE)
+        rec.SetWords(True)
+
+        # Popen as a context manager closes the pipe and waits for ffmpeg on
+        # exit, so neither the file descriptor nor the child process leaks.
+        with self.resample_ffmpeg(infile) as stream:
+            result, tot_samples = self.recognize_stream(rec, stream)
+        final_result = self.format_result(result)
+
+        if outfile != '':
+            with open(outfile, 'w', encoding='utf-8') as fh:
+                fh.write(final_result)
+        else:
+            print(final_result)
+        return final_result, tot_samples
+
+    def process_directory(self, args):
+        """Transcribe every regular file directly inside ``args.input``.
+
+        Each transcript is written to ``args.output`` under the input file's
+        stem with ``'.' + args.output_type`` appended.
+        """
+        # Thread-backed pool with the multiprocessing.Pool API. Work runs in
+        # this process, so ``self`` (and the loaded model, which is presumably
+        # not picklable) never has to be sent to a worker process.
+        from multiprocessing.dummy import Pool
+
+        in_dir = Path(args.input)
+        task_list = [
+            # The extension is appended to the stem rather than applied with
+            # with_suffix(), which would drop part of a stem that itself
+            # contains a dot ('a.b.wav' -> 'a.txt').
+            (in_dir / fn, Path(args.output, Path(fn).stem + '.' + args.output_type))
+            for fn in sorted(os.listdir(in_dir))
+            if (in_dir / fn).is_file()  # skip subdirectories
+        ]
+        with Pool() as pool:
+            pool.map(self.process_entry, task_list)
+
+    # Alias for callers that use the shorter name.
+    process_dir = process_directory
+
+    def process_file(self, args):
+        """Transcribe ``args.input`` to ``args.output`` and log timing.
+
+        The logged ``xRT`` value is the audio duration (samples divided by the
+        sample rate) divided by the elapsed wall-clock time.
+        """
+        start_time = timer()
+        _, tot_samples = self.process_entry([args.input, args.output])
+        elapsed = timer() - start_time
+        xrt = tot_samples / float(self.SAMPLE_RATE) / float(elapsed)
+        logging.info(f'Execution time: {elapsed:.3f} sec; xRT: {xrt:.3f}')