2025-12-01
This commit is contained in:
@@ -0,0 +1,78 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
import logging
|
||||
import argparse
|
||||
|
||||
from pathlib import Path
|
||||
from vosk import list_models, list_languages
|
||||
from vosk.transcriber.transcriber import Transcriber
|
||||
|
||||
# Command-line interface for the transcriber script.
parser = argparse.ArgumentParser(
    description='Transcribe audio file and save result in selected format')

# Model selection: an explicit path, a model name, or a language.
parser.add_argument(
    '--model', '-m', type=str,
    help='model path')
parser.add_argument(
    '--list-models', default=False, action='store_true',
    help='list available models')
parser.add_argument(
    '--list-languages', default=False, action='store_true',
    help='list available languages')
parser.add_argument(
    '--model-name', '-n', type=str,
    help='select model by name')
parser.add_argument(
    '--lang', '-l', default='en-us', type=str,
    help='select model by language')

# Input and output locations.
parser.add_argument(
    '--input', '-i', type=str,
    help='audiofile')
parser.add_argument(
    '--output', '-o', default='', type=str,
    help='optional output filename path')
# Only 'txt' and 'srt' are rendered by Transcriber.format_result; any other
# value used to be accepted and silently produced empty output, so reject it
# at parse time instead.
parser.add_argument(
    '--output-type', '-t', default='txt', type=str, choices=('txt', 'srt'),
    help='optional arg output data type')

parser.add_argument(
    '--log-level', default='INFO',
    help='logging level')
|
||||
|
||||
def main():
    """Parse the command line and run the requested action.

    Handles ``--list-models`` / ``--list-languages`` first, then validates
    the input and output paths and transcribes either a single file or a
    whole directory. Exits with status 1 (after logging an error) when the
    arguments are invalid.
    """
    # Local import so this function does not depend on the file's import block.
    import sys

    args = parser.parse_args()
    logging.getLogger().setLevel(args.log_level.upper())

    if args.list_models:
        list_models()
        return

    if args.list_languages:
        list_languages()
        return

    # Fatal problems are logged at ERROR so they remain visible when the user
    # raises --log-level above INFO.
    if not args.input:
        logging.error('Please specify input file or directory')
        sys.exit(1)

    input_path = Path(args.input)
    if not input_path.exists():
        logging.error('File %s does not exist, please specify an existing file/directory', args.input)
        sys.exit(1)

    # NOTE(review): an explicit output path must already exist, so a new
    # output file has to be created by the user beforehand. Kept as-is;
    # confirm whether this is intended.
    if args.output != '' and not Path(args.output).exists():
        logging.error('Output %s does not exist, please specify an existing file', args.output)
        sys.exit(1)

    # Decide the mode before building the Transcriber so that the model is
    # not loaded only to reject the arguments afterwards.
    output_path = Path(args.output)
    dir_mode = input_path.is_dir() and output_path.is_dir()
    file_mode = input_path.is_file() and (args.output == '' or output_path.is_file())
    if not (dir_mode or file_mode):
        logging.error('Wrong arguments, input and output must be same type')
        sys.exit(1)

    transcriber = Transcriber(args)
    if dir_mode:
        # The Transcriber method is named process_directory.
        transcriber.process_directory(args)
    else:
        transcriber.process_file(args)
|
||||
|
||||
# Run the command-line entry point only when executed as a script,
# not when this module is imported.
if __name__ == '__main__':
    main()
|
||||
@@ -0,0 +1,89 @@
|
||||
import json
|
||||
import subprocess
|
||||
import srt
|
||||
import datetime
|
||||
import os
|
||||
import logging
|
||||
|
||||
from pathlib import Path
|
||||
from timeit import default_timer as timer
|
||||
from vosk import KaldiRecognizer, Model
|
||||
|
||||
class Transcriber:
    """Transcribe audio with a Vosk model and render it as plain text or SRT.

    Input audio is decoded by an ``ffmpeg`` subprocess into 16 kHz mono
    signed 16-bit PCM and fed to a ``KaldiRecognizer`` chunk by chunk.
    """

    SAMPLE_RATE = 16000    # Hz; requested from ffmpeg and given to the recognizer
    BYTES_PER_SAMPLE = 2   # ffmpeg is asked for 's16le' (16-bit samples)
    CHUNK_SIZE = 4000      # bytes read from ffmpeg per recognizer call

    def __init__(self, args):
        """Load the model selected by ``args`` and keep ``args`` for later use.

        Args:
            args: Parsed command-line namespace. ``model``, ``model_name``
                and ``lang`` are forwarded to ``Model``; ``output_type`` is
                read by ``format_result``.
        """
        self.model = Model(model_path=args.model, model_name=args.model_name, lang=args.lang)
        self.args = args

    def recognize_stream(self, rec, stream):
        """Feed decoded audio from ``stream`` to ``rec`` until the stream ends.

        Args:
            rec: Recognizer providing ``AcceptWaveform``, ``Result`` and
                ``FinalResult``.
            stream: Object whose ``stdout`` yields raw PCM bytes (the ffmpeg
                subprocess returned by ``resample_ffmpeg``).

        Returns:
            A ``(result, tot_samples)`` tuple: the list of decoded JSON
            results (one per ``Result()`` plus the ``FinalResult()``) and the
            number of 16-bit samples read from the stream.
        """
        total_bytes = 0
        result = []
        while True:
            data = stream.stdout.read(self.CHUNK_SIZE)
            if not data:
                break
            # Count every chunk, not only those that complete an utterance;
            # otherwise the audio length (and the xRT figure) is understated.
            total_bytes += len(data)
            if rec.AcceptWaveform(data):
                result.append(json.loads(rec.Result()))
        result.append(json.loads(rec.FinalResult()))
        # Convert bytes to samples once at the end so odd-sized reads from
        # the pipe do not lose half samples.
        return result, total_bytes // self.BYTES_PER_SAMPLE

    def format_result(self, result, words_per_line=7):
        """Render recognizer results in the format named by ``args.output_type``.

        Args:
            result: List of result dicts as returned by ``recognize_stream``.
            words_per_line: Maximum number of words per subtitle (SRT only).

        Returns:
            The SRT document for ``'srt'``, the space-joined texts (each
            followed by a space) for ``'txt'``, and ``''`` for any other type.
        """
        output_type = self.args.output_type
        if output_type == 'srt':
            subs = []
            for res in result:
                # Entries without word-level details cannot be timed; skip them.
                if 'result' not in res:
                    continue
                words = res['result']
                for j in range(0, len(words), words_per_line):
                    line = words[j:j + words_per_line]
                    subs.append(srt.Subtitle(
                        index=len(subs),
                        content=' '.join(w['word'] for w in line),
                        start=datetime.timedelta(seconds=line[0]['start']),
                        end=datetime.timedelta(seconds=line[-1]['end'])))
            return srt.compose(subs)
        if output_type == 'txt':
            return ''.join(part['text'] + ' ' for part in result)
        return ''

    def resample_ffmpeg(self, infile):
        """Start ffmpeg decoding ``infile`` to 16 kHz mono s16le on its stdout.

        Returns:
            The ``subprocess.Popen`` object; the caller reads ``stdout`` and
            is responsible for closing/waiting on the process.
        """
        return subprocess.Popen(
            ['ffmpeg', '-nostdin', '-loglevel', 'quiet', '-i',
             infile,
             '-ar', str(self.SAMPLE_RATE), '-ac', '1', '-f', 's16le', '-'],
            stdout=subprocess.PIPE)

    def process_entry(self, inputdata):
        """Transcribe one file and write or print the formatted result.

        Args:
            inputdata: ``(input_path, output_path)`` pair. When the output
                path equals ``''`` the result is printed instead of written.

        Returns:
            ``(final_result, tot_samples)``: the formatted text and the
            number of audio samples processed.
        """
        infile, outfile = inputdata[0], inputdata[1]
        logging.info(f'Recognizing {infile}')

        rec = KaldiRecognizer(self.model, self.SAMPLE_RATE)
        rec.SetWords(True)

        # Popen as a context manager closes the pipe and waits for ffmpeg,
        # so no file descriptor or child process is left behind.
        with self.resample_ffmpeg(infile) as stream:
            result, tot_samples = self.recognize_stream(rec, stream)
        final_result = self.format_result(result)

        if outfile != '':
            with open(outfile, 'w', encoding='utf-8') as fh:
                fh.write(final_result)
        else:
            print(final_result)
        return final_result, tot_samples

    def process_directory(self, args):
        """Transcribe every regular file in ``args.input`` into ``args.output``.

        Each output file is named after its input with the extension replaced
        by ``args.output_type``.
        """
        # A thread pool runs the bound method directly; a process pool would
        # have to pickle ``self`` (including the model) for each task.
        # NOTE(review): assumes one Model can be shared by recognizers in
        # several threads - confirm against the Vosk documentation.
        from multiprocessing.dummy import Pool

        in_dir = Path(args.input)
        task_list = [
            # Append the extension to the stem instead of using with_suffix,
            # which would truncate names such as 'a.b.wav' to 'a.<type>'.
            (in_dir / fn, Path(args.output, Path(fn).stem + '.' + args.output_type))
            for fn in sorted(os.listdir(in_dir))
            if (in_dir / fn).is_file()  # skip sub-directories
        ]
        with Pool() as pool:
            pool.map(self.process_entry, task_list)

    # The command-line script calls this method as ``process_dir``.
    process_dir = process_directory

    def process_file(self, args):
        """Transcribe ``args.input`` and log the elapsed time and xRT factor."""
        start_time = timer()
        _, tot_samples = self.process_entry([args.input, args.output])
        elapsed = timer() - start_time
        # Audio duration in seconds divided by wall-clock processing time.
        xrt = tot_samples / float(self.SAMPLE_RATE) / float(elapsed)
        logging.info(f'Execution time: {elapsed:.3f} sec; xRT: {xrt:.3f}')
|
||||
Reference in New Issue
Block a user