2025-12-01
This commit is contained in:
@@ -0,0 +1,235 @@
|
||||
import os
|
||||
import sys
|
||||
|
||||
import requests
|
||||
from urllib.request import urlretrieve
|
||||
from zipfile import ZipFile
|
||||
from re import match
|
||||
from pathlib import Path
|
||||
from .vosk_cffi import ffi as _ffi
|
||||
from tqdm import tqdm
|
||||
|
||||
# Remote location of the models and local folders
|
||||
MODEL_PRE_URL = 'https://alphacephei.com/vosk/models/'
|
||||
MODEL_LIST_URL = MODEL_PRE_URL + 'model-list.json'
|
||||
MODEL_DIRS = [os.getenv('VOSK_MODEL_PATH'), Path('/usr/share/vosk'), Path.home() / 'AppData/Local/vosk', Path.home() / '.cache/vosk']
|
||||
|
||||
def open_dll():
|
||||
dlldir = os.path.abspath(os.path.dirname(__file__))
|
||||
if sys.platform == 'win32':
|
||||
# We want to load dependencies too
|
||||
os.environ["PATH"] = dlldir + os.pathsep + os.environ['PATH']
|
||||
if hasattr(os, 'add_dll_directory'):
|
||||
os.add_dll_directory(dlldir)
|
||||
return _ffi.dlopen(os.path.join(dlldir, "libvosk.dll"))
|
||||
elif sys.platform == 'linux':
|
||||
return _ffi.dlopen(os.path.join(dlldir, "libvosk.so"))
|
||||
elif sys.platform == 'darwin':
|
||||
return _ffi.dlopen(os.path.join(dlldir, "libvosk.dyld"))
|
||||
else:
|
||||
raise TypeError("Unsupported platform")
|
||||
|
||||
_c = open_dll()
|
||||
|
||||
def list_models():
|
||||
response = requests.get(MODEL_LIST_URL)
|
||||
for model in response.json():
|
||||
print(model['name'])
|
||||
|
||||
def list_languages():
|
||||
response = requests.get(MODEL_LIST_URL)
|
||||
languages = set([m['lang'] for m in response.json()])
|
||||
for lang in languages:
|
||||
print (lang)
|
||||
|
||||
class Model(object):
|
||||
def __init__(self, model_path=None, model_name=None, lang=None):
|
||||
if model_path != None:
|
||||
self._handle = _c.vosk_model_new(model_path.encode('utf-8'))
|
||||
else:
|
||||
model_path = self.get_model_path(model_name, lang)
|
||||
self._handle = _c.vosk_model_new(model_path.encode('utf-8'))
|
||||
if self._handle == _ffi.NULL:
|
||||
raise Exception("Failed to create a model")
|
||||
|
||||
def __del__(self):
|
||||
_c.vosk_model_free(self._handle)
|
||||
|
||||
def vosk_model_find_word(self, word):
|
||||
return _c.vosk_model_find_word(self._handle, word.encode('utf-8'))
|
||||
|
||||
def get_model_path(self, model_name, lang):
|
||||
if model_name is None:
|
||||
model_path = self.get_model_by_lang(lang)
|
||||
else:
|
||||
model_path = self.get_model_by_name(model_name)
|
||||
return str(model_path)
|
||||
|
||||
def get_model_by_name(self, model_name):
|
||||
for directory in MODEL_DIRS:
|
||||
if directory is None or not Path(directory).exists():
|
||||
continue
|
||||
model_file_list = os.listdir(directory)
|
||||
model_file = [model for model in model_file_list if model == model_name]
|
||||
if model_file != []:
|
||||
return Path(directory, model_file[0])
|
||||
response = requests.get(MODEL_LIST_URL)
|
||||
result_model = [model['name'] for model in response.json() if model['name'] == model_name]
|
||||
if result_model == []:
|
||||
raise Exception("model name %s does not exist" % (model_name))
|
||||
else:
|
||||
self.download_model(Path(directory, result_model[0]))
|
||||
return Path(directory, result_model[0])
|
||||
|
||||
def get_model_by_lang(self, lang):
|
||||
for directory in MODEL_DIRS:
|
||||
if directory is None or not Path(directory).exists():
|
||||
continue
|
||||
model_file_list = os.listdir(directory)
|
||||
model_file = [model for model in model_file_list if match(f"vosk-model(-small)?-{lang}", model)]
|
||||
if model_file != []:
|
||||
return Path(directory, model_file[0])
|
||||
response = requests.get(MODEL_LIST_URL)
|
||||
result_model = [model['name'] for model in response.json() if model['lang'] == lang and model['type'] == 'small' and model['obsolete'] == 'false']
|
||||
if result_model == []:
|
||||
raise Exception("lang %s does not exist" % (lang))
|
||||
else:
|
||||
self.download_model(Path(directory, result_model[0]))
|
||||
return Path(directory, result_model[0])
|
||||
|
||||
def download_model(self, model_name):
|
||||
if not MODEL_DIRS[3].exists():
|
||||
MODEL_DIRS[3].mkdir()
|
||||
with tqdm(unit='B', unit_scale=True, unit_divisor=1024, miniters=1,
|
||||
desc=(MODEL_PRE_URL + str(model_name.name) + '.zip').split('/')[-1]) as t:
|
||||
reporthook = self.download_progress_hook(t)
|
||||
urlretrieve(MODEL_PRE_URL + str(model_name.name) + '.zip', str(model_name) + '.zip',
|
||||
reporthook=reporthook, data=None)
|
||||
t.total = t.n
|
||||
with ZipFile(str(model_name) + '.zip', 'r') as model_ref:
|
||||
model_ref.extractall(model_name.parent)
|
||||
Path(str(model_name) + '.zip').unlink()
|
||||
|
||||
def download_progress_hook(self, t):
|
||||
last_b = [0]
|
||||
def update_to(b=1, bsize=1, tsize=None):
|
||||
if tsize not in (None, -1):
|
||||
t.total = tsize
|
||||
displayed = t.update((b - last_b[0]) * bsize)
|
||||
last_b[0] = b
|
||||
return displayed
|
||||
return update_to
|
||||
|
||||
class SpkModel(object):
|
||||
|
||||
def __init__(self, model_path):
|
||||
self._handle = _c.vosk_spk_model_new(model_path.encode('utf-8'))
|
||||
|
||||
if self._handle == _ffi.NULL:
|
||||
raise Exception("Failed to create a speaker model")
|
||||
|
||||
def __del__(self):
|
||||
_c.vosk_spk_model_free(self._handle)
|
||||
|
||||
class KaldiRecognizer(object):
|
||||
|
||||
def __init__(self, *args):
|
||||
if len(args) == 2:
|
||||
self._handle = _c.vosk_recognizer_new(args[0]._handle, args[1])
|
||||
elif len(args) == 3 and type(args[2]) is SpkModel:
|
||||
self._handle = _c.vosk_recognizer_new_spk(args[0]._handle, args[1], args[2]._handle)
|
||||
elif len(args) == 3 and type(args[2]) is str:
|
||||
self._handle = _c.vosk_recognizer_new_grm(args[0]._handle, args[1], args[2].encode('utf-8'))
|
||||
else:
|
||||
raise TypeError("Unknown arguments")
|
||||
|
||||
if self._handle == _ffi.NULL:
|
||||
raise Exception("Failed to create a recognizer")
|
||||
|
||||
def __del__(self):
|
||||
_c.vosk_recognizer_free(self._handle)
|
||||
|
||||
def SetMaxAlternatives(self, max_alternatives):
|
||||
_c.vosk_recognizer_set_max_alternatives(self._handle, max_alternatives)
|
||||
|
||||
def SetWords(self, enable_words):
|
||||
_c.vosk_recognizer_set_words(self._handle, 1 if enable_words else 0)
|
||||
|
||||
def SetPartialWords(self, enable_partial_words):
|
||||
_c.vosk_recognizer_set_partial_words(self._handle, 1 if enable_partial_words else 0)
|
||||
|
||||
def SetNLSML(self, enable_nlsml):
|
||||
_c.vosk_recognizer_set_nlsml(self._handle, 1 if enable_nlsml else 0)
|
||||
|
||||
def SetSpkModel(self, spk_model):
|
||||
_c.vosk_recognizer_set_spk_model(self._handle, spk_model._handle)
|
||||
|
||||
def AcceptWaveform(self, data):
|
||||
res = _c.vosk_recognizer_accept_waveform(self._handle, data, len(data))
|
||||
if res < 0:
|
||||
raise Exception("Failed to process waveform")
|
||||
return res
|
||||
|
||||
def Result(self):
|
||||
return _ffi.string(_c.vosk_recognizer_result(self._handle)).decode('utf-8')
|
||||
|
||||
def PartialResult(self):
|
||||
return _ffi.string(_c.vosk_recognizer_partial_result(self._handle)).decode('utf-8')
|
||||
|
||||
def FinalResult(self):
|
||||
return _ffi.string(_c.vosk_recognizer_final_result(self._handle)).decode('utf-8')
|
||||
|
||||
def Reset(self):
|
||||
return _c.vosk_recognizer_reset(self._handle)
|
||||
|
||||
|
||||
def SetLogLevel(level):
|
||||
return _c.vosk_set_log_level(level)
|
||||
|
||||
|
||||
def GpuInit():
|
||||
_c.vosk_gpu_init()
|
||||
|
||||
|
||||
def GpuThreadInit():
|
||||
_c.vosk_gpu_thread_init()
|
||||
|
||||
class BatchModel(object):
|
||||
|
||||
def __init__(self, *args):
|
||||
self._handle = _c.vosk_batch_model_new()
|
||||
|
||||
if self._handle == _ffi.NULL:
|
||||
raise Exception("Failed to create a model")
|
||||
|
||||
def __del__(self):
|
||||
_c.vosk_batch_model_free(self._handle)
|
||||
|
||||
def Wait(self):
|
||||
_c.vosk_batch_model_wait(self._handle)
|
||||
|
||||
class BatchRecognizer(object):
|
||||
|
||||
def __init__(self, *args):
|
||||
self._handle = _c.vosk_batch_recognizer_new(args[0]._handle, args[1])
|
||||
|
||||
if self._handle == _ffi.NULL:
|
||||
raise Exception("Failed to create a recognizer")
|
||||
|
||||
def __del__(self):
|
||||
_c.vosk_batch_recognizer_free(self._handle)
|
||||
|
||||
def AcceptWaveform(self, data):
|
||||
res = _c.vosk_batch_recognizer_accept_waveform(self._handle, data, len(data))
|
||||
|
||||
def Result(self):
|
||||
ptr = _c.vosk_batch_recognizer_front_result(self._handle)
|
||||
res = _ffi.string(ptr).decode('utf-8')
|
||||
_c.vosk_batch_recognizer_pop(self._handle)
|
||||
return res
|
||||
|
||||
def FinishStream(self):
|
||||
_c.vosk_batch_recognizer_finish_stream(self._handle)
|
||||
|
||||
def GetPendingChunks(self):
|
||||
return _c.vosk_batch_recognizer_get_pending_chunks(self._handle)
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,78 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
import logging
|
||||
import argparse
|
||||
|
||||
from pathlib import Path
|
||||
from vosk import list_models, list_languages
|
||||
from vosk.transcriber.transcriber import Transcriber
|
||||
|
||||
parser = argparse.ArgumentParser(
|
||||
description = 'Transcribe audio file and save result in selected format')
|
||||
parser.add_argument(
|
||||
'--model', '-m', type=str,
|
||||
help='model path')
|
||||
parser.add_argument(
|
||||
'--list-models', default=False, action='store_true',
|
||||
help='list available models')
|
||||
parser.add_argument(
|
||||
'--list-languages', default=False, action='store_true',
|
||||
help='list available languages')
|
||||
parser.add_argument(
|
||||
'--model-name', '-n', type=str,
|
||||
help='select model by name')
|
||||
parser.add_argument(
|
||||
'--lang', '-l', default='en-us', type=str,
|
||||
help='select model by language')
|
||||
parser.add_argument(
|
||||
'--input', '-i', type=str,
|
||||
help='audiofile')
|
||||
parser.add_argument(
|
||||
'--output', '-o', default='', type=str,
|
||||
help='optional output filename path')
|
||||
parser.add_argument(
|
||||
'--output-type', '-t', default='txt', type=str,
|
||||
help='optional arg output data type')
|
||||
parser.add_argument(
|
||||
'--log-level', default='INFO',
|
||||
help='logging level')
|
||||
|
||||
def main():
|
||||
|
||||
args = parser.parse_args()
|
||||
log_level = args.log_level.upper()
|
||||
logging.getLogger().setLevel(log_level)
|
||||
|
||||
if args.list_models == True:
|
||||
list_models()
|
||||
return
|
||||
|
||||
if args.list_languages == True:
|
||||
list_languages()
|
||||
return
|
||||
|
||||
if not args.input:
|
||||
logging.info('Please specify input file or directory')
|
||||
exit(1)
|
||||
|
||||
if not Path(args.input).exists():
|
||||
logging.info('File %s does not exist, please specify an existing file/directory' % (args.input))
|
||||
exit(1)
|
||||
|
||||
if args.output !='' and not Path(args.output).exists():
|
||||
logging.info('Output %s does not exist, please specify an existing file' % (args.output))
|
||||
exit(1)
|
||||
|
||||
transcriber = Transcriber(args)
|
||||
|
||||
if Path(args.input).is_dir() and Path(args.output).is_dir():
|
||||
transcriber.process_dir(args)
|
||||
return
|
||||
elif Path(args.input).is_file() and (args.output=='' or Path(args.output).is_file()):
|
||||
transcriber.process_file(args)
|
||||
else:
|
||||
logging.info('Wrong arguments, input and output must be same type')
|
||||
exit(1)
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -0,0 +1,89 @@
|
||||
import json
|
||||
import subprocess
|
||||
import srt
|
||||
import datetime
|
||||
import os
|
||||
import logging
|
||||
|
||||
from pathlib import Path
|
||||
from timeit import default_timer as timer
|
||||
from vosk import KaldiRecognizer, Model
|
||||
|
||||
class Transcriber:
|
||||
|
||||
def __init__(self, args):
|
||||
self.model = Model(model_path=args.model, model_name=args.model_name, lang=args.lang)
|
||||
self.args = args
|
||||
|
||||
def recognize_stream(self, rec, stream):
|
||||
tot_samples = 0
|
||||
result = []
|
||||
while True:
|
||||
data = stream.stdout.read(4000)
|
||||
if len(data) == 0:
|
||||
break
|
||||
if rec.AcceptWaveform(data):
|
||||
tot_samples += len(data)
|
||||
result.append(json.loads(rec.Result()))
|
||||
result.append(json.loads(rec.FinalResult()))
|
||||
return result, tot_samples
|
||||
|
||||
def format_result(self, result, words_per_line=7):
|
||||
final_result = ''
|
||||
if self.args.output_type == 'srt':
|
||||
subs = []
|
||||
for i, res in enumerate(result):
|
||||
if not 'result' in res:
|
||||
continue
|
||||
words = res['result']
|
||||
for j in range(0, len(words), words_per_line):
|
||||
line = words[j : j + words_per_line]
|
||||
s = srt.Subtitle(index=len(subs),
|
||||
content = ' '.join([l['word'] for l in line]),
|
||||
start=datetime.timedelta(seconds=line[0]['start']),
|
||||
end=datetime.timedelta(seconds=line[-1]['end']))
|
||||
subs.append(s)
|
||||
final_result = srt.compose(subs)
|
||||
elif self.args.output_type == 'txt':
|
||||
for part in result:
|
||||
final_result += part['text'] + ' '
|
||||
return final_result
|
||||
|
||||
|
||||
def resample_ffmpeg(self, infile):
|
||||
stream = subprocess.Popen(
|
||||
['ffmpeg', '-nostdin', '-loglevel', 'quiet', '-i',
|
||||
infile,
|
||||
'-ar', '16000','-ac', '1', '-f', 's16le', '-'],
|
||||
stdout=subprocess.PIPE)
|
||||
return stream
|
||||
|
||||
|
||||
def process_entry(self, inputdata):
|
||||
logging.info(f'Recognizing {inputdata[0]}')
|
||||
|
||||
rec = KaldiRecognizer(self.model, 16000)
|
||||
rec.SetWords(True)
|
||||
|
||||
stream = self.resample_ffmpeg(inputdata[0])
|
||||
result, tot_samples = self.recognize_stream(rec, stream)
|
||||
final_result = self.format_result(result)
|
||||
|
||||
if inputdata[1] != '':
|
||||
with open(inputdata[1], 'w', encoding='utf-8') as fh:
|
||||
fh.write(final_result)
|
||||
else:
|
||||
print(final_result)
|
||||
return final_result, tot_samples
|
||||
|
||||
|
||||
def process_directory(self,args):
|
||||
task_list = [(Path(args.input, fn), Path(args.output, Path(fn).stem).with_suffix('.' + args.output_type)) for fn in os.listdir(args.input)]
|
||||
with Pool() as pool:
|
||||
pool.map(self.process_entry, file_list)
|
||||
|
||||
def process_file(self, args):
|
||||
start_time = timer()
|
||||
final_result, tot_samples = self.process_entry([args.input, args.output])
|
||||
elapsed = timer() - start_time
|
||||
logging.info(f'''Execution time: {elapsed:.3f} sec; xRT: {format(tot_samples / 16000.0 / float(elapsed), '.3f')}''')
|
||||
@@ -0,0 +1,10 @@
|
||||
# auto-generated file
|
||||
import _cffi_backend
|
||||
|
||||
ffi = _cffi_backend.FFI('vosk.vosk_cffi',
|
||||
_version = 0x2601,
|
||||
_types = b'\x00\x00\x03\x0D\x00\x00\x00\x0F\x00\x00\x1B\x0D\x00\x00\x5B\x03\x00\x00\x0D\x01\x00\x00\x00\x0F\x00\x00\x0A\x0D\x00\x00\x60\x03\x00\x00\x00\x0F\x00\x00\x1E\x0D\x00\x00\x5D\x03\x00\x00\x0D\x01\x00\x00\x00\x0F\x00\x00\x1E\x0D\x00\x00\x0A\x11\x00\x00\x0D\x01\x00\x00\x5F\x03\x00\x00\x00\x0F\x00\x00\x1E\x0D\x00\x00\x0A\x11\x00\x00\x0D\x01\x00\x00\x07\x11\x00\x00\x00\x0F\x00\x00\x10\x0D\x00\x00\x07\x11\x00\x00\x00\x0F\x00\x00\x07\x0D\x00\x00\x5C\x03\x00\x00\x00\x0F\x00\x00\x07\x0D\x00\x00\x5E\x03\x00\x00\x00\x0F\x00\x00\x2A\x0D\x00\x00\x1B\x11\x00\x00\x00\x0F\x00\x00\x2A\x0D\x00\x00\x0A\x11\x00\x00\x07\x11\x00\x00\x00\x0F\x00\x00\x2A\x0D\x00\x00\x1E\x11\x00\x00\x07\x11\x00\x00\x07\x01\x00\x00\x00\x0F\x00\x00\x2A\x0D\x00\x00\x1E\x11\x00\x00\x04\x03\x00\x00\x07\x01\x00\x00\x00\x0F\x00\x00\x2A\x0D\x00\x00\x1E\x11\x00\x00\x61\x03\x00\x00\x07\x01\x00\x00\x00\x0F\x00\x00\x62\x0D\x00\x00\x03\x11\x00\x00\x00\x0F\x00\x00\x62\x0D\x00\x00\x1B\x11\x00\x00\x00\x0F\x00\x00\x62\x0D\x00\x00\x1B\x11\x00\x00\x07\x11\x00\x00\x07\x01\x00\x00\x00\x0F\x00\x00\x62\x0D\x00\x00\x1B\x11\x00\x00\x07\x01\x00\x00\x00\x0F\x00\x00\x62\x0D\x00\x00\x0A\x11\x00\x00\x00\x0F\x00\x00\x62\x0D\x00\x00\x1E\x11\x00\x00\x00\x0F\x00\x00\x62\x0D\x00\x00\x1E\x11\x00\x00\x10\x11\x00\x00\x00\x0F\x00\x00\x62\x0D\x00\x00\x1E\x11\x00\x00\x07\x01\x00\x00\x00\x0F\x00\x00\x62\x0D\x00\x00\x10\x11\x00\x00\x00\x0F\x00\x00\x62\x0D\x00\x00\x07\x01\x00\x00\x00\x0F\x00\x00\x62\x0D\x00\x00\x00\x0F\x00\x00\x00\x09\x00\x00\x01\x09\x00\x00\x02\x09\x00\x00\x03\x09\x00\x00\x04\x09\x00\x00\x02\x01\x00\x00\x05\x01\x00\x00\x00\x01',
|
||||
_globals = (b'\x00\x00\x36\x23vosk_batch_model_free',0,b'\x00\x00\x00\x23vosk_batch_model_new',0,b'\x00\x00\x36\x23vosk_batch_model_wait',0,b'\x00\x00\x3C\x23vosk_batch_recognizer_accept_waveform',0,b'\x00\x00\x39\x23vosk_batch_recognizer_finish_stream',0,b'\x00\x00\x39\x23vosk_batch_recognizer_free',0,b'\x00\x00\x1A\x23vosk_batch_recognizer_front_result',0,b'\x00\x00\x20\x23vosk_batch_recognizer_get_pending_chunks',0,b'\x00\x00\x02\x23vosk_batch_recognizer_new',0,b'\x00\x00\x39\x23vosk_batch_recognizer_pop',0,b'\x00\x00\x41\x23vosk_batch_recognizer_set_nlsml',0,b'\x00\x00\x59\x23vosk_gpu_init',0,b'\x00\x00\x59\x23vosk_gpu_thread_init',0,b'\x00\x00\x23\x23vosk_model_find_word',0,b'\x00\x00\x45\x23vosk_model_free',0,b'\x00\x00\x06\x23vosk_model_new',0,b'\x00\x00\x27\x23vosk_recognizer_accept_waveform',0,b'\x00\x00\x2C\x23vosk_recognizer_accept_waveform_f',0,b'\x00\x00\x31\x23vosk_recognizer_accept_waveform_s',0,b'\x00\x00\x1D\x23vosk_recognizer_final_result',0,b'\x00\x00\x48\x23vosk_recognizer_free',0,b'\x00\x00\x09\x23vosk_recognizer_new',0,b'\x00\x00\x12\x23vosk_recognizer_new_grm',0,b'\x00\x00\x0D\x23vosk_recognizer_new_spk',0,b'\x00\x00\x1D\x23vosk_recognizer_partial_result',0,b'\x00\x00\x48\x23vosk_recognizer_reset',0,b'\x00\x00\x1D\x23vosk_recognizer_result',0,b'\x00\x00\x4F\x23vosk_recognizer_set_max_alternatives',0,b'\x00\x00\x4F\x23vosk_recognizer_set_nlsml',0,b'\x00\x00\x4F\x23vosk_recognizer_set_partial_words',0,b'\x00\x00\x4B\x23vosk_recognizer_set_spk_model',0,b'\x00\x00\x4F\x23vosk_recognizer_set_words',0,b'\x00\x00\x56\x23vosk_set_log_level',0,b'\x00\x00\x53\x23vosk_spk_model_free',0,b'\x00\x00\x17\x23vosk_spk_model_new',0),
|
||||
_struct_unions = ((b'\x00\x00\x00\x5B\x00\x00\x00\x10VoskBatchModel',),(b'\x00\x00\x00\x5C\x00\x00\x00\x10VoskBatchRecognizer',),(b'\x00\x00\x00\x5D\x00\x00\x00\x10VoskModel',),(b'\x00\x00\x00\x5E\x00\x00\x00\x10VoskRecognizer',),(b'\x00\x00\x00\x5F\x00\x00\x00\x10VoskSpkModel',)),
|
||||
_typenames = (b'\x00\x00\x00\x5BVoskBatchModel',b'\x00\x00\x00\x5CVoskBatchRecognizer',b'\x00\x00\x00\x5DVoskModel',b'\x00\x00\x00\x5EVoskRecognizer',b'\x00\x00\x00\x5FVoskSpkModel'),
|
||||
)
|
||||
Reference in New Issue
Block a user