DeepSpeech RESTful API?

Has anyone wrapped a fully trained DeepSpeech model as an API, either with Flask or TensorFlow Serving? I can only run single-shot inference, and loading the model on every request takes a long time. Here is my current script:

```python
import warnings
warnings.filterwarnings("ignore")  # suppress Python warnings raised during the TensorFlow imports

import argparse
from datetime import datetime
from multiprocessing import cpu_count

import tensorflow as tf

from ds_ctcdecoder import ctc_beam_search_decoder_batch, Scorer
from DeepSpeech import create_model, try_loading
from util.config import Config, initialize_globals
from util.feeding import audiofile_to_features
from util.flags import create_flags, FLAGS

print('request received {} \n'.format(datetime.utcnow()))

parser = argparse.ArgumentParser()
parser.add_argument('-a', '--audio', required=True, help='Required 8kHz 16bit PCM wav audio sample path.')
args = parser.parse_args()

# Decoder and feature settings; these must match the trained model
lm_alpha = 0.75
lm_beta = 1.85
alphabet_config_path = '../alphabet.txt'
lm_binary_path = '../lm3.binary'
lm_trie_path = '../trie3'
audio_window_samples = 256
audio_step_samples = 160
n_input = 26
n_context = 9
beam_width = 500
wav_filename = args.audio



def evaluate(wav_filename):
    print('scorer initialized {} \n'.format(datetime.utcnow()))
    scorer = Scorer(lm_alpha, lm_beta,
                    lm_binary_path, lm_trie_path,
                    Config.alphabet)

    print('prediction start {} \n'.format(datetime.utcnow()))
    features, features_len = audiofile_to_features(wav_filename)
    print('features initialized {} \n'.format(datetime.utcnow()))

    # Add batch dimension
    batch_x = tf.expand_dims(features, 0)
    batch_x_len = tf.expand_dims(features_len, 0)

    # One dropout rate per layer
    no_dropout = [None] * 6
    logits, _ = create_model(batch_x=batch_x,
                             seq_length=batch_x_len,
                             dropout=no_dropout)
    print('model initialized {} \n'.format(datetime.utcnow()))

    # Transpose to batch major and apply softmax for decoder
    transposed = tf.nn.softmax(tf.transpose(logits, [1, 0, 2]))

    tf.train.get_or_create_global_step()

    # Get number of accessible CPU cores for this process
    try:
        num_processes = cpu_count()
    except NotImplementedError:
        num_processes = 1

    # Create a saver using variables from the above newly created graph
    saver = tf.train.Saver()

    with tf.Session(config=Config.session_config) as session:
        # Restore variables from the best training checkpoint
        loaded = try_loading(session, saver, 'best_dev_checkpoint', 'best validation')
        if not loaded:
            raise RuntimeError('Could not load a valid checkpoint from {}'.format(FLAGS.checkpoint_dir))
        print('session initialized {} \n'.format(datetime.utcnow()))

        # Compute the transposed logits and sequence lengths for decoding
        batch_logits, batch_lengths = session.run([transposed, batch_x_len])

        decoded = ctc_beam_search_decoder_batch(batch_logits, batch_lengths, Config.alphabet, beam_width,
                                                num_processes=num_processes, scorer=scorer)

        # Keep the best transcript for the single batch item
        predictions = [d[0][1] for d in decoded][0]
        print('prediction end {} \n'.format(datetime.utcnow()))

    return predictions


def main(_):
    initialize_globals()
    print('globals initialized {} \n'.format(datetime.utcnow()))

    return evaluate(wav_filename)

if __name__ == '__main__':
    create_flags()
    tf.app.run(main)
```

Yes, several people have done so successfully, myself included.

The reason you're getting a bad experience is that you're using the wrong tools. You should use the .pbmm model file and the deepspeech Python bindings, not the full-blown TensorFlow code as you do now.

Have a look at native_client/python/client.py, wrap that in Flask, and you're done.
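
For reference, here is a minimal sketch of that approach: load the .pbmm once at startup with the `deepspeech` package and serve transcriptions over Flask. It assumes the 0.6-style Python API (`Model(model_path, beam_width)`, `enableDecoderWithLM(...)`, `stt(...)`); the file paths and the `/stt` route are placeholders for your own setup.

```python
import wave

import numpy as np
from deepspeech import Model
from flask import Flask, jsonify, request

BEAM_WIDTH = 500
LM_ALPHA = 0.75
LM_BETA = 1.85

# Load the exported .pbmm once at startup, not once per request.
ds = Model('output_graph.pbmm', BEAM_WIDTH)
ds.enableDecoderWithLM('lm.binary', 'trie', LM_ALPHA, LM_BETA)

app = Flask(__name__)

@app.route('/stt', methods=['POST'])
def stt():
    # Expects a 16-bit mono PCM WAV upload named 'audio', sampled at the
    # rate the model was trained on.
    with wave.open(request.files['audio'], 'rb') as fin:
        audio = np.frombuffer(fin.readframes(fin.getnframes()), np.int16)
    return jsonify(transcript=ds.stt(audio))

if __name__ == '__main__':
    app.run(host='0.0.0.0', port=5000)
```

With that running, something like `curl -F audio=@sample.wav http://localhost:5000/stt` should return a transcript without ever reloading the model.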
