2025-12-01

This commit is contained in:
2026-03-17 14:58:51 -06:00
parent 183e865f8b
commit 4b82b57113
6846 changed files with 954887 additions and 162606 deletions
@@ -0,0 +1,235 @@
import os
import sys
import requests
from urllib.request import urlretrieve
from zipfile import ZipFile
from re import match
from pathlib import Path
from .vosk_cffi import ffi as _ffi
from tqdm import tqdm
# Remote location of the models and local folders
# Base URL of the model archives; download_model() appends '<model name>.zip'.
MODEL_PRE_URL = 'https://alphacephei.com/vosk/models/'
# JSON index of the published models; entries are read by their 'name',
# 'lang', 'type' and 'obsolete' keys in this module.
MODEL_LIST_URL = MODEL_PRE_URL + 'model-list.json'
# Directories searched for an already-present model, in this order. The first
# entry is None when VOSK_MODEL_PATH is unset (and a str otherwise); the
# remaining entries are Path objects. The last entry (index 3) is the one
# downloaded models are written to.
MODEL_DIRS = [os.getenv('VOSK_MODEL_PATH'), Path('/usr/share/vosk'), Path.home() / 'AppData/Local/vosk', Path.home() / '.cache/vosk']
def open_dll():
    """Load the libvosk shared library located next to this module.

    Returns:
        The library object returned by ``_ffi.dlopen``.

    Raises:
        TypeError: if ``sys.platform`` is not ``win32``, ``linux`` or
            ``darwin``.
    """
    package_dir = os.path.abspath(os.path.dirname(__file__))
    platform = sys.platform

    if platform == 'linux':
        return _ffi.dlopen(os.path.join(package_dir, "libvosk.so"))
    if platform == 'darwin':
        return _ffi.dlopen(os.path.join(package_dir, "libvosk.dyld"))
    if platform != 'win32':
        raise TypeError("Unsupported platform")

    # Windows: the package directory is put on the DLL search path first so
    # that the dependencies of libvosk.dll can be loaded too.
    os.environ["PATH"] = package_dir + os.pathsep + os.environ['PATH']
    if hasattr(os, 'add_dll_directory'):
        os.add_dll_directory(package_dir)
    return _ffi.dlopen(os.path.join(package_dir, "libvosk.dll"))
# Load the native library once, at import time; every wrapper in this module
# calls into it through `_c`.
_c = open_dll()
def list_models():
    """Print the name of every model in the remote model list, one per line.

    Raises:
        requests.RequestException: if the list cannot be fetched (network
            failure, timeout, or a non-success HTTP status).
    """
    # Without a timeout a stalled connection would block the caller forever.
    response = requests.get(MODEL_LIST_URL, timeout=30)
    # Surface HTTP errors directly instead of failing later while decoding
    # an error page as JSON.
    response.raise_for_status()
    for model in response.json():
        print(model['name'])
def list_languages():
    """Print every distinct language code in the remote model list.

    Codes are printed one per line in sorted order so the output is stable
    between runs.

    Raises:
        requests.RequestException: if the list cannot be fetched (network
            failure, timeout, or a non-success HTTP status).
    """
    # Without a timeout a stalled connection would block the caller forever.
    response = requests.get(MODEL_LIST_URL, timeout=30)
    # Surface HTTP errors directly instead of failing later while decoding
    # an error page as JSON.
    response.raise_for_status()
    languages = {m['lang'] for m in response.json()}
    for lang in sorted(languages):
        print(lang)
class Model(object):
    """Owner of a native Vosk model handle.

    The model is loaded from ``model_path`` when one is given. Otherwise it is
    looked up by ``model_name`` (or, failing that, by ``lang``) in
    ``MODEL_DIRS`` and downloaded from ``MODEL_PRE_URL`` when no local copy
    exists.
    """

    def __init__(self, model_path=None, model_name=None, lang=None):
        """Load a model.

        Args:
            model_path: directory of an existing model; takes precedence over
                the other two arguments.
            model_name: exact model name to find locally or download.
            lang: language code used when ``model_name`` is None.

        Raises:
            Exception: if the name/language is unknown or the native library
                returns a NULL handle.
        """
        if model_path is None:
            model_path = self.get_model_path(model_name, lang)
        self._handle = _c.vosk_model_new(model_path.encode('utf-8'))
        if self._handle == _ffi.NULL:
            raise Exception("Failed to create a model")

    def __del__(self):
        # __init__ can raise before _handle is assigned (e.g. unknown model
        # name); in that case there is nothing to release.
        handle = getattr(self, '_handle', None)
        if handle is not None:
            _c.vosk_model_free(handle)

    def vosk_model_find_word(self, word):
        """Return the native ``vosk_model_find_word`` result for ``word``."""
        return _c.vosk_model_find_word(self._handle, word.encode('utf-8'))

    def get_model_path(self, model_name, lang):
        """Return, as a string, the path of the model chosen by name or language.

        ``model_name`` wins when it is not None; otherwise ``lang`` is used.
        """
        if model_name is None:
            model_path = self.get_model_by_lang(lang)
        else:
            model_path = self.get_model_by_name(model_name)
        return str(model_path)

    def _find_local_model(self, accept):
        """Return the first entry in MODEL_DIRS whose name satisfies ``accept``, or None."""
        for directory in MODEL_DIRS:
            if directory is None or not Path(directory).exists():
                continue
            for entry in os.listdir(directory):
                if accept(entry):
                    return Path(directory, entry)
        return None

    def _fetch_model_list(self):
        """Download and decode the remote model index."""
        # Without a timeout a stalled connection would block forever.
        response = requests.get(MODEL_LIST_URL, timeout=30)
        response.raise_for_status()
        return response.json()

    def _download_to_cache(self, remote_name):
        """Download ``remote_name`` into the last MODEL_DIRS entry and return its path."""
        # The last entry of MODEL_DIRS is the download location; it is named
        # explicitly here rather than taken from a leftover loop variable.
        target = Path(MODEL_DIRS[-1], remote_name)
        self.download_model(target)
        return target

    def get_model_by_name(self, model_name):
        """Return the path of the model called ``model_name``.

        A local copy in ``MODEL_DIRS`` is preferred; otherwise the model is
        downloaded.

        Raises:
            Exception: if the remote model list has no such name.
        """
        local = self._find_local_model(lambda entry: entry == model_name)
        if local is not None:
            return local
        candidates = [model['name'] for model in self._fetch_model_list()
                      if model['name'] == model_name]
        if not candidates:
            raise Exception("model name %s does not exist" % (model_name))
        return self._download_to_cache(candidates[0])

    def get_model_by_lang(self, lang):
        """Return the path of a model for language ``lang``.

        A local entry whose name starts with ``vosk-model-<lang>`` or
        ``vosk-model-small-<lang>`` is preferred; otherwise the first
        non-obsolete small model for the language is downloaded.

        Raises:
            Exception: if the remote model list has no matching model.
        """
        local = self._find_local_model(
            lambda entry: match(f"vosk-model(-small)?-{lang}", entry))
        if local is not None:
            return local
        # The index is compared against the string 'false' for 'obsolete',
        # exactly as the published list is consumed elsewhere in this module.
        candidates = [model['name'] for model in self._fetch_model_list()
                      if model['lang'] == lang and model['type'] == 'small'
                      and model['obsolete'] == 'false']
        if not candidates:
            raise Exception("lang %s does not exist" % (lang))
        return self._download_to_cache(candidates[0])

    def download_model(self, model_name):
        """Download and unpack a model archive.

        Args:
            model_name: ``Path`` of the model directory to create; its final
                component is the remote model name, and the archive is
                extracted into its parent directory.
        """
        archive_name = model_name.name + '.zip'
        archive_path = Path(str(model_name) + '.zip')
        # Create the whole destination path; a plain mkdir() fails when the
        # parent (e.g. ~/.cache) does not exist yet.
        model_name.parent.mkdir(parents=True, exist_ok=True)
        try:
            with tqdm(unit='B', unit_scale=True, unit_divisor=1024, miniters=1,
                      desc=archive_name) as t:
                reporthook = self.download_progress_hook(t)
                urlretrieve(MODEL_PRE_URL + archive_name, str(archive_path),
                            reporthook=reporthook, data=None)
                t.total = t.n
            with ZipFile(str(archive_path), 'r') as model_ref:
                model_ref.extractall(model_name.parent)
        finally:
            # Always remove the archive: a leftover '<name>.zip' would match
            # the language pattern in get_model_by_lang() on the next run and
            # be returned as if it were a model directory.
            if archive_path.exists():
                archive_path.unlink()

    def download_progress_hook(self, t):
        """Return a ``urlretrieve`` report hook that advances progress bar ``t``."""
        last_b = [0]

        def update_to(b=1, bsize=1, tsize=None):
            # b: blocks transferred so far, bsize: block size in bytes,
            # tsize: total size (None or -1 when unknown).
            if tsize not in (None, -1):
                t.total = tsize
            displayed = t.update((b - last_b[0]) * bsize)
            last_b[0] = b
            return displayed

        return update_to
class SpkModel(object):
    """Owner of a native Vosk speaker-model handle."""

    def __init__(self, model_path):
        """Load the speaker model stored at ``model_path``.

        Raises:
            Exception: if the native library returns a NULL handle.
        """
        self._handle = _c.vosk_spk_model_new(model_path.encode('utf-8'))
        if self._handle == _ffi.NULL:
            raise Exception("Failed to create a speaker model")

    def __del__(self):
        # __init__ can raise before _handle is assigned (e.g. a non-str
        # path); in that case there is nothing to release.
        handle = getattr(self, '_handle', None)
        if handle is not None:
            _c.vosk_spk_model_free(handle)
class KaldiRecognizer(object):
    """Owner of a native Vosk recognizer handle.

    Accepted constructor forms:

    * ``KaldiRecognizer(model, sample_rate)``
    * ``KaldiRecognizer(model, sample_rate, spk_model)`` with a ``SpkModel``
    * ``KaldiRecognizer(model, sample_rate, grammar)`` with a ``str`` grammar
    """

    def __init__(self, *args):
        """Create the recognizer.

        Raises:
            TypeError: if the arguments match none of the accepted forms.
            Exception: if the native library returns a NULL handle.
        """
        if len(args) == 2:
            self._handle = _c.vosk_recognizer_new(args[0]._handle, args[1])
        # isinstance (rather than an exact type check) also accepts
        # subclasses of SpkModel and str.
        elif len(args) == 3 and isinstance(args[2], SpkModel):
            self._handle = _c.vosk_recognizer_new_spk(args[0]._handle, args[1], args[2]._handle)
        elif len(args) == 3 and isinstance(args[2], str):
            self._handle = _c.vosk_recognizer_new_grm(args[0]._handle, args[1], args[2].encode('utf-8'))
        else:
            raise TypeError("Unknown arguments")

        if self._handle == _ffi.NULL:
            raise Exception("Failed to create a recognizer")

    def __del__(self):
        # When __init__ raises TypeError no handle was ever assigned, so
        # there is nothing to release.
        handle = getattr(self, '_handle', None)
        if handle is not None:
            _c.vosk_recognizer_free(handle)

    def SetMaxAlternatives(self, max_alternatives):
        """Forward ``max_alternatives`` to the native recognizer."""
        _c.vosk_recognizer_set_max_alternatives(self._handle, max_alternatives)

    def SetWords(self, enable_words):
        """Pass 1 to the native ``set_words`` call when ``enable_words`` is truthy, else 0."""
        _c.vosk_recognizer_set_words(self._handle, 1 if enable_words else 0)

    def SetPartialWords(self, enable_partial_words):
        """Pass 1 to the native ``set_partial_words`` call when truthy, else 0."""
        _c.vosk_recognizer_set_partial_words(self._handle, 1 if enable_partial_words else 0)

    def SetNLSML(self, enable_nlsml):
        """Pass 1 to the native ``set_nlsml`` call when ``enable_nlsml`` is truthy, else 0."""
        _c.vosk_recognizer_set_nlsml(self._handle, 1 if enable_nlsml else 0)

    def SetSpkModel(self, spk_model):
        """Attach ``spk_model`` (an object exposing ``_handle``) to the recognizer."""
        _c.vosk_recognizer_set_spk_model(self._handle, spk_model._handle)

    def AcceptWaveform(self, data):
        """Feed a chunk of audio bytes to the recognizer.

        Returns:
            The non-negative value returned by the native call.

        Raises:
            Exception: if the native call returns a negative value.
        """
        res = _c.vosk_recognizer_accept_waveform(self._handle, data, len(data))
        if res < 0:
            raise Exception("Failed to process waveform")
        return res

    def Result(self):
        """Return the native result string decoded as UTF-8."""
        return _ffi.string(_c.vosk_recognizer_result(self._handle)).decode('utf-8')

    def PartialResult(self):
        """Return the native partial-result string decoded as UTF-8."""
        return _ffi.string(_c.vosk_recognizer_partial_result(self._handle)).decode('utf-8')

    def FinalResult(self):
        """Return the native final-result string decoded as UTF-8."""
        return _ffi.string(_c.vosk_recognizer_final_result(self._handle)).decode('utf-8')

    def Reset(self):
        """Call the native recognizer reset and return its result."""
        return _c.vosk_recognizer_reset(self._handle)
def SetLogLevel(level):
    """Pass ``level`` to the native ``vosk_set_log_level`` and return its result."""
    return _c.vosk_set_log_level(level)
def GpuInit():
    """Call the native ``vosk_gpu_init``; returns None."""
    _c.vosk_gpu_init()
def GpuThreadInit():
    """Call the native ``vosk_gpu_thread_init``; returns None."""
    _c.vosk_gpu_thread_init()
class BatchModel(object):
    """Owner of a native Vosk batch-model handle."""

    def __init__(self, *args):
        """Create the batch model; positional arguments are accepted but unused.

        Raises:
            Exception: if the native library returns a NULL handle.
        """
        self._handle = _c.vosk_batch_model_new()
        if self._handle == _ffi.NULL:
            raise Exception("Failed to create a model")

    def __del__(self):
        # If the native constructor call itself raised, no handle exists and
        # there is nothing to release.
        handle = getattr(self, '_handle', None)
        if handle is not None:
            _c.vosk_batch_model_free(handle)

    def Wait(self):
        """Call the native ``vosk_batch_model_wait`` on this model."""
        _c.vosk_batch_model_wait(self._handle)
class BatchRecognizer(object):
    """Owner of a native Vosk batch-recognizer handle."""

    def __init__(self, *args):
        """Create the recognizer from ``(batch_model, sample_rate)``.

        ``args[0]`` must expose ``_handle``; ``args[1]`` is passed to the
        native constructor unchanged.

        Raises:
            IndexError: if fewer than two positional arguments are given.
            Exception: if the native library returns a NULL handle.
        """
        self._handle = _c.vosk_batch_recognizer_new(args[0]._handle, args[1])
        if self._handle == _ffi.NULL:
            raise Exception("Failed to create a recognizer")

    def __del__(self):
        # __init__ can raise before _handle is assigned (e.g. missing
        # arguments); in that case there is nothing to release.
        handle = getattr(self, '_handle', None)
        if handle is not None:
            _c.vosk_batch_recognizer_free(handle)

    def AcceptWaveform(self, data):
        """Feed a chunk of audio bytes to the recognizer; returns None."""
        _c.vosk_batch_recognizer_accept_waveform(self._handle, data, len(data))

    def Result(self):
        """Return the front result as a UTF-8 decoded string and pop it."""
        ptr = _c.vosk_batch_recognizer_front_result(self._handle)
        # Copy the string out before popping; the pop call follows the read
        # so the text is captured first.
        res = _ffi.string(ptr).decode('utf-8')
        _c.vosk_batch_recognizer_pop(self._handle)
        return res

    def FinishStream(self):
        """Call the native ``finish_stream`` on this recognizer."""
        _c.vosk_batch_recognizer_finish_stream(self._handle)

    def GetPendingChunks(self):
        """Return the native ``get_pending_chunks`` value."""
        return _c.vosk_batch_recognizer_get_pending_chunks(self._handle)
@@ -0,0 +1,78 @@
#!/usr/bin/env python3
import logging
import argparse
from pathlib import Path
from vosk import list_models, list_languages
from vosk.transcriber.transcriber import Transcriber
def _build_parser():
    """Create the argument parser of the transcriber command-line tool."""
    cli = argparse.ArgumentParser(
        description='Transcribe audio file and save result in selected format')

    # (flags, keyword arguments) for each option, in help-output order.
    option_specs = [
        (('--model', '-m'),
         {'type': str, 'help': 'model path'}),
        (('--list-models',),
         {'default': False, 'action': 'store_true', 'help': 'list available models'}),
        (('--list-languages',),
         {'default': False, 'action': 'store_true', 'help': 'list available languages'}),
        (('--model-name', '-n'),
         {'type': str, 'help': 'select model by name'}),
        (('--lang', '-l'),
         {'default': 'en-us', 'type': str, 'help': 'select model by language'}),
        (('--input', '-i'),
         {'type': str, 'help': 'audiofile'}),
        (('--output', '-o'),
         {'default': '', 'type': str, 'help': 'optional output filename path'}),
        (('--output-type', '-t'),
         {'default': 'txt', 'type': str, 'help': 'optional arg output data type'}),
        (('--log-level',),
         {'default': 'INFO', 'help': 'logging level'}),
    ]
    for flags, options in option_specs:
        cli.add_argument(*flags, **options)
    return cli


parser = _build_parser()
def main():
    """Run the transcriber command-line tool.

    Parses the command line, handles the two listing options, validates the
    input/output paths and then transcribes either a single file or a whole
    directory. Exits with status 1 on invalid arguments.
    """
    args = parser.parse_args()
    log_level = args.log_level.upper()
    logging.getLogger().setLevel(log_level)

    if args.list_models:
        list_models()
        return
    if args.list_languages:
        list_languages()
        return

    # SystemExit is raised directly: the exit() helper is injected by the
    # site module for interactive use and is not guaranteed to exist.
    if not args.input:
        logging.info('Please specify input file or directory')
        raise SystemExit(1)
    if not Path(args.input).exists():
        logging.info('File %s does not exist, please specify an existing file/directory' % (args.input))
        raise SystemExit(1)
    if args.output != '' and not Path(args.output).exists():
        logging.info('Output %s does not exist, please specify an existing file' % (args.output))
        raise SystemExit(1)

    transcriber = Transcriber(args)

    # Note: an empty --output makes Path('') the current directory, so a
    # directory input without --output is treated as directory-to-directory.
    if Path(args.input).is_dir() and Path(args.output).is_dir():
        # The Transcriber method is named process_directory; calling the
        # non-existent process_dir raised AttributeError.
        transcriber.process_directory(args)
        return
    elif Path(args.input).is_file() and (args.output == '' or Path(args.output).is_file()):
        transcriber.process_file(args)
    else:
        logging.info('Wrong arguments, input and output must be same type')
        raise SystemExit(1)
# Run the command-line tool only when this file is executed as a script,
# not when it is imported.
if __name__ == "__main__":
    main()
@@ -0,0 +1,89 @@
import json
import subprocess
import srt
import datetime
import os
import logging
from pathlib import Path
from timeit import default_timer as timer
from vosk import KaldiRecognizer, Model
class Transcriber:
    """Transcribe audio files with a Vosk model and format the result.

    ``args`` is the parsed command-line namespace; the attributes read by
    this class are ``model``, ``model_name``, ``lang``, ``input``, ``output``
    and ``output_type``.
    """

    def __init__(self, args):
        """Load the model selected by ``args`` and keep ``args`` for later use."""
        self.model = Model(model_path=args.model, model_name=args.model_name, lang=args.lang)
        self.args = args

    def recognize_stream(self, rec, stream):
        """Feed the audio of ``stream`` to ``rec`` and collect the results.

        Args:
            rec: recognizer providing ``AcceptWaveform``, ``Result`` and
                ``FinalResult``.
            stream: object whose ``stdout`` yields raw 16-bit PCM bytes (as
                produced by ``resample_ffmpeg``).

        Returns:
            ``(result, tot_samples)``: the list of decoded JSON results (the
            final result is always the last element) and the number of
            16-bit samples read.
        """
        tot_bytes = 0
        result = []
        while True:
            data = stream.stdout.read(4000)
            if len(data) == 0:
                break
            # Count every chunk, not only those that complete an utterance,
            # otherwise the processed duration is under-reported.
            tot_bytes += len(data)
            if rec.AcceptWaveform(data):
                result.append(json.loads(rec.Result()))
        result.append(json.loads(rec.FinalResult()))
        # The stream is s16le, i.e. two bytes per sample.
        return result, tot_bytes // 2

    def format_result(self, result, words_per_line=7):
        """Render recognition results in the format named by ``args.output_type``.

        ``'srt'`` produces subtitles with at most ``words_per_line`` words
        each, ``'txt'`` joins the recognized texts (each followed by a
        space); any other type yields an empty string.
        """
        output_type = self.args.output_type
        if output_type == 'srt':
            subs = []
            for res in result:
                # Results without word timings cannot be turned into cues.
                if 'result' not in res:
                    continue
                words = res['result']
                for j in range(0, len(words), words_per_line):
                    line = words[j:j + words_per_line]
                    subs.append(srt.Subtitle(
                        index=len(subs),
                        content=' '.join(word['word'] for word in line),
                        start=datetime.timedelta(seconds=line[0]['start']),
                        end=datetime.timedelta(seconds=line[-1]['end'])))
            return srt.compose(subs)
        if output_type == 'txt':
            return ''.join(part['text'] + ' ' for part in result)
        return ''

    def resample_ffmpeg(self, infile):
        """Start ffmpeg converting ``infile`` to 16 kHz mono s16le on its stdout.

        Returns the ``subprocess.Popen`` object; the caller is responsible
        for reading ``stdout`` and waiting for the process.
        """
        stream = subprocess.Popen(
            ['ffmpeg', '-nostdin', '-loglevel', 'quiet', '-i',
             str(infile),  # infile may be a Path when called for a directory
             '-ar', '16000', '-ac', '1', '-f', 's16le', '-'],
            stdout=subprocess.PIPE)
        return stream

    def process_entry(self, inputdata):
        """Transcribe one file.

        Args:
            inputdata: ``(input_path, output_path)``; an ``output_path`` equal
                to ``''`` prints the result instead of writing a file.

        Returns:
            ``(final_result, tot_samples)``.
        """
        logging.info(f'Recognizing {inputdata[0]}')
        rec = KaldiRecognizer(self.model, 16000)
        rec.SetWords(True)
        stream = self.resample_ffmpeg(inputdata[0])
        try:
            result, tot_samples = self.recognize_stream(rec, stream)
        finally:
            # Release the pipe and reap ffmpeg so no zombie process or open
            # file descriptor is left behind.
            stream.stdout.close()
            stream.wait()
        final_result = self.format_result(result)
        if inputdata[1] != '':
            with open(inputdata[1], 'w', encoding='utf-8') as fh:
                fh.write(final_result)
        else:
            print(final_result)
        return final_result, tot_samples

    def process_directory(self, args):
        """Transcribe every file in ``args.input`` into ``args.output``.

        Each output file is named after the input file's stem with
        ``args.output_type`` as its suffix.
        """
        task_list = [
            (Path(args.input, fn),
             Path(args.output, Path(fn).stem).with_suffix('.' + args.output_type))
            for fn in sorted(os.listdir(args.input))
            # Sub-directories cannot be decoded by ffmpeg; skip them.
            if Path(args.input, fn).is_file()
        ]
        # Entries are processed sequentially. The earlier pool-based version
        # referenced an unimported Pool and an undefined file_list, so it
        # could never run.
        for task in task_list:
            self.process_entry(task)

    # Alias kept for callers that use the shorter name.
    process_dir = process_directory

    def process_file(self, args):
        """Transcribe ``args.input`` to ``args.output`` and log timing information."""
        start_time = timer()
        final_result, tot_samples = self.process_entry([args.input, args.output])
        elapsed = timer() - start_time
        logging.info(f'Execution time: {elapsed:.3f} sec; xRT: {tot_samples / 16000.0 / elapsed:.3f}')
@@ -0,0 +1,10 @@
# Auto-generated cffi module; regenerate it rather than editing it by hand.
import _cffi_backend
ffi = _cffi_backend.FFI('vosk.vosk_cffi',
_version = 0x2601,
_types = b'\x00\x00\x03\x0D\x00\x00\x00\x0F\x00\x00\x1B\x0D\x00\x00\x5B\x03\x00\x00\x0D\x01\x00\x00\x00\x0F\x00\x00\x0A\x0D\x00\x00\x60\x03\x00\x00\x00\x0F\x00\x00\x1E\x0D\x00\x00\x5D\x03\x00\x00\x0D\x01\x00\x00\x00\x0F\x00\x00\x1E\x0D\x00\x00\x0A\x11\x00\x00\x0D\x01\x00\x00\x5F\x03\x00\x00\x00\x0F\x00\x00\x1E\x0D\x00\x00\x0A\x11\x00\x00\x0D\x01\x00\x00\x07\x11\x00\x00\x00\x0F\x00\x00\x10\x0D\x00\x00\x07\x11\x00\x00\x00\x0F\x00\x00\x07\x0D\x00\x00\x5C\x03\x00\x00\x00\x0F\x00\x00\x07\x0D\x00\x00\x5E\x03\x00\x00\x00\x0F\x00\x00\x2A\x0D\x00\x00\x1B\x11\x00\x00\x00\x0F\x00\x00\x2A\x0D\x00\x00\x0A\x11\x00\x00\x07\x11\x00\x00\x00\x0F\x00\x00\x2A\x0D\x00\x00\x1E\x11\x00\x00\x07\x11\x00\x00\x07\x01\x00\x00\x00\x0F\x00\x00\x2A\x0D\x00\x00\x1E\x11\x00\x00\x04\x03\x00\x00\x07\x01\x00\x00\x00\x0F\x00\x00\x2A\x0D\x00\x00\x1E\x11\x00\x00\x61\x03\x00\x00\x07\x01\x00\x00\x00\x0F\x00\x00\x62\x0D\x00\x00\x03\x11\x00\x00\x00\x0F\x00\x00\x62\x0D\x00\x00\x1B\x11\x00\x00\x00\x0F\x00\x00\x62\x0D\x00\x00\x1B\x11\x00\x00\x07\x11\x00\x00\x07\x01\x00\x00\x00\x0F\x00\x00\x62\x0D\x00\x00\x1B\x11\x00\x00\x07\x01\x00\x00\x00\x0F\x00\x00\x62\x0D\x00\x00\x0A\x11\x00\x00\x00\x0F\x00\x00\x62\x0D\x00\x00\x1E\x11\x00\x00\x00\x0F\x00\x00\x62\x0D\x00\x00\x1E\x11\x00\x00\x10\x11\x00\x00\x00\x0F\x00\x00\x62\x0D\x00\x00\x1E\x11\x00\x00\x07\x01\x00\x00\x00\x0F\x00\x00\x62\x0D\x00\x00\x10\x11\x00\x00\x00\x0F\x00\x00\x62\x0D\x00\x00\x07\x01\x00\x00\x00\x0F\x00\x00\x62\x0D\x00\x00\x00\x0F\x00\x00\x00\x09\x00\x00\x01\x09\x00\x00\x02\x09\x00\x00\x03\x09\x00\x00\x04\x09\x00\x00\x02\x01\x00\x00\x05\x01\x00\x00\x00\x01',
_globals = (b'\x00\x00\x36\x23vosk_batch_model_free',0,b'\x00\x00\x00\x23vosk_batch_model_new',0,b'\x00\x00\x36\x23vosk_batch_model_wait',0,b'\x00\x00\x3C\x23vosk_batch_recognizer_accept_waveform',0,b'\x00\x00\x39\x23vosk_batch_recognizer_finish_stream',0,b'\x00\x00\x39\x23vosk_batch_recognizer_free',0,b'\x00\x00\x1A\x23vosk_batch_recognizer_front_result',0,b'\x00\x00\x20\x23vosk_batch_recognizer_get_pending_chunks',0,b'\x00\x00\x02\x23vosk_batch_recognizer_new',0,b'\x00\x00\x39\x23vosk_batch_recognizer_pop',0,b'\x00\x00\x41\x23vosk_batch_recognizer_set_nlsml',0,b'\x00\x00\x59\x23vosk_gpu_init',0,b'\x00\x00\x59\x23vosk_gpu_thread_init',0,b'\x00\x00\x23\x23vosk_model_find_word',0,b'\x00\x00\x45\x23vosk_model_free',0,b'\x00\x00\x06\x23vosk_model_new',0,b'\x00\x00\x27\x23vosk_recognizer_accept_waveform',0,b'\x00\x00\x2C\x23vosk_recognizer_accept_waveform_f',0,b'\x00\x00\x31\x23vosk_recognizer_accept_waveform_s',0,b'\x00\x00\x1D\x23vosk_recognizer_final_result',0,b'\x00\x00\x48\x23vosk_recognizer_free',0,b'\x00\x00\x09\x23vosk_recognizer_new',0,b'\x00\x00\x12\x23vosk_recognizer_new_grm',0,b'\x00\x00\x0D\x23vosk_recognizer_new_spk',0,b'\x00\x00\x1D\x23vosk_recognizer_partial_result',0,b'\x00\x00\x48\x23vosk_recognizer_reset',0,b'\x00\x00\x1D\x23vosk_recognizer_result',0,b'\x00\x00\x4F\x23vosk_recognizer_set_max_alternatives',0,b'\x00\x00\x4F\x23vosk_recognizer_set_nlsml',0,b'\x00\x00\x4F\x23vosk_recognizer_set_partial_words',0,b'\x00\x00\x4B\x23vosk_recognizer_set_spk_model',0,b'\x00\x00\x4F\x23vosk_recognizer_set_words',0,b'\x00\x00\x56\x23vosk_set_log_level',0,b'\x00\x00\x53\x23vosk_spk_model_free',0,b'\x00\x00\x17\x23vosk_spk_model_new',0),
_struct_unions = ((b'\x00\x00\x00\x5B\x00\x00\x00\x10VoskBatchModel',),(b'\x00\x00\x00\x5C\x00\x00\x00\x10VoskBatchRecognizer',),(b'\x00\x00\x00\x5D\x00\x00\x00\x10VoskModel',),(b'\x00\x00\x00\x5E\x00\x00\x00\x10VoskRecognizer',),(b'\x00\x00\x00\x5F\x00\x00\x00\x10VoskSpkModel',)),
_typenames = (b'\x00\x00\x00\x5BVoskBatchModel',b'\x00\x00\x00\x5CVoskBatchRecognizer',b'\x00\x00\x00\x5DVoskModel',b'\x00\x00\x00\x5EVoskRecognizer',b'\x00\x00\x00\x5FVoskSpkModel'),
)