How can I bulk/batch transcribe wav files using python?
I'm trying to use my Python app to transcribe multiple files in a folder and speed up the process. At present I am able to do it one file at a time:

####RUN THIS PART FIRST#########
import json
from os.path import join, dirname
from ibm_watson import SpeechToTextV1
from ibm_watson.websocket import RecognizeCallback, AudioSource
import threading
from ibm_cloud_sdk_core.authenticators import IAMAuthenticator
import pandas as pd
authenticator = IAMAuthenticator('xxyyzz')

service = SpeechToTextV1(authenticator=authenticator)
service.set_service_url('https://api.us-east.speech-to-text.watson.cloud.ibm.com')

models = service.list_models().get_result()
#print(json.dumps(models, indent=2))

model = service.get_model('en-US_BroadbandModel').get_result()
#print(json.dumps(model, indent=2))

# This is the name of the file you need to change below
with open('Call 8.wav', 'rb') as audio_file:
    output = service.recognize(
        audio=audio_file,
        speaker_labels=True,
        content_type='audio/wav',
        #timestamps=True,
        #word_confidence=True,
        inactivity_timeout=-1,
        model='en-US_NarrowbandModel',
        continuous=True).get_result()
############END################################

# get data to a csv
########################RUN THIS PART SECOND#####################################
df0 = pd.DataFrame([alt for res in output['results'] for alt in res['alternatives']])

df1 = pd.DataFrame(output['speaker_labels'])

list(df0.columns) 
list(df1.columns) 
df0 = df0.drop(["timestamps"], axis=1)
df1 = df1.drop(["final"], axis=1)
df1 = df1.drop(['confidence'],axis=1)
test3 = pd.concat([df0, df1], axis=1)
#sentiment
transcript = test3['transcript']
transcript = transcript.dropna()
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
analyzer = SentimentIntensityAnalyzer()
text = transcript
scores = []
for txt in text:
    vs = analyzer.polarity_scores(txt)
    scores.append(vs)
data = pd.DataFrame(text, columns= ['Text'])
data2 = pd.DataFrame(scores)
final_dataset= pd.concat([data,data2], axis=1)
test4 = pd.concat([test3,final_dataset], axis=1)
test4 = test4.drop(['Text'],axis=1)
test4.rename(columns={'neg':'Negative'}, 
                 inplace=True)
test4.rename(columns={'pos':'Positive'}, 
                 inplace=True)
test4.rename(columns={'neu':'Neutral'}, 
                 inplace=True)

# This is the name of the output csv file
test4.to_csv("Call 8.csv")

How can I do this to transcribe multiple files in a folder instead of one file at a time? I can run this script multiple times, but I want to automate it so that it picks up the wav files from a folder and processes each one. Say I have 15 audio wav files in my folder C:\Python. I want an automated process that runs the script and produces 15 csvs, one per wav file with its respective output. Right now this script works, but I have to run it manually for each wav file to get that file's output csv.

Also, as a second question (sorry!), is there a way to speed up the transcription? I tried breaking the wav files into smaller segments and sending them to Watson, but it didn't work. My reference was https://github.com/freelanceastro/interview-transcriber.
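For reference, splitting a wav into fixed-length segments can be done with just the standard-library wave module (a sketch; the chunk length and the _partN naming are arbitrary choices of mine):

```python
import wave

def split_wav(path, chunk_seconds=60):
    """Split a wav file into fixed-length segments; the last may be shorter."""
    with wave.open(path, "rb") as src:
        params = src.getparams()
        frames_per_chunk = src.getframerate() * chunk_seconds
        part_paths = []
        part = 0
        while True:
            frames = src.readframes(frames_per_chunk)
            if not frames:
                break
            part_path = f"{path[:-4]}_part{part}.wav"
            with wave.open(part_path, "wb") as dst:
                # Copy channel count, sample width, and frame rate from the source;
                # the frame count in the header is fixed up automatically on close.
                dst.setparams(params)
                dst.writeframes(frames)
            part_paths.append(part_path)
            part += 1
    return part_paths
```

Each segment could then be sent to service.recognize() separately, though the per-segment transcripts would need stitching back together afterwards.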

Feodora answered 3/3, 2021 at 4:58 Comment(3)
Add what you have tried, e.g. have you sent multiple requests in parallel? If yes, what is the error you are seeing?Kalk
I haven't sent them in parallel. How can I do it for multiple files in a folder?Feodora
Python has some multi-processing / async-processing modules and features.Kalk

I think I might have something:

import os
import json
import time
# import threading
from pathlib import Path

import concurrent.futures

# from os.path import join, dirname
from ibm_watson import SpeechToTextV1
from ibm_watson.websocket import RecognizeCallback, AudioSource
from ibm_cloud_sdk_core.authenticators import IAMAuthenticator

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

import pandas as pd

# Replace with your api key.
my_api_key = "abc123"

# You can add a directory path to Path() if you want to run
# the project from a different folder at some point.
directory = Path().absolute()


authenticator = IAMAuthenticator(my_api_key)

service = SpeechToTextV1(authenticator=authenticator)
service.set_service_url('https://api.us-east.speech-to-text.watson.cloud.ibm.com')
# I used this URL.
# service.set_service_url('https://stream.watsonplatform.net/speech-to-text/api') 


models = service.list_models().get_result()
#print(json.dumps(models, indent=2))

model = service.get_model('en-US_BroadbandModel').get_result()
#print(json.dumps(model, indent=2))



# get data to a csv
########################RUN THIS PART SECOND#####################################


def process_data(json_data, output_path):

    print(f"Processing: {output_path.stem}")

    cols = ["transcript", "confidence"]

    dfdata = [[t[cols[0]], t[cols[1]]] for r in json_data.get('results') for t in r.get("alternatives")]

    df0 = pd.DataFrame(data = dfdata, columns = cols)

    df1 = pd.DataFrame(json_data.get("speaker_labels")).drop(["final", "confidence"], axis=1)


    # test3 = pd.concat([df0, df1], axis=1)
    test3 = pd.merge(df0, df1, left_index = True, right_index = True)


    # sentiment
    print(f"Getting sentiment for: {output_path.stem}")
    transcript = test3["transcript"]
    transcript.dropna(inplace=True)

    analyzer = SentimentIntensityAnalyzer()
    text = transcript
    scores = [analyzer.polarity_scores(txt) for txt in text]

    # data = pd.DataFrame(text, columns = ["Text"])
    data = transcript.to_frame(name="Text")
    data2 = pd.DataFrame(scores)


    # final_dataset= pd.concat([data, data2], axis=1)
    final_dataset = pd.merge(data, data2, left_index = True, right_index = True)

    # test4 = pd.concat([test3, final_dataset], axis=1)
    test4 = pd.merge(test3, final_dataset, left_index = True, right_index = True)

    test4.drop("Text", axis=1, inplace=True)

    test4.rename(columns = {
            "neg": "Negative",
            "pos": "Positive",
            "neu": "Neutral",
            }, inplace=True)

    # This is the name of the output csv file
    test4.to_csv(output_path, index = False)


def process_audio_file(filename, output_type = "csv"):

    audio_file_path = directory.joinpath(filename)

    # Update output path to consider `output_type` parameter.
    out_path = directory.joinpath(f"{audio_file_path.stem}.{output_type}")

    print(f"Current file: '{filename}'")

    with open(audio_file_path, "rb") as audio_file:
        data = service.recognize(
                audio = audio_file,
                speaker_labels = True,
                content_type = "audio/wav",
                inactivity_timeout = -1,
                model = "en-US_NarrowbandModel",
                continuous = True,
            ).get_result()

    print(f"Speech-to-text complete for: '{filename}'")

    # Return data and output path as collection.
    return [data, out_path]


def main():
    print("Running main()...")

    # Default num. workers == min(32, os.cpu_count() + 4)
    n_workers = os.cpu_count() + 2

    # Create generator for all .wav files in folder (and subfolders).
    file_gen = directory.glob("**/*.wav")

    with concurrent.futures.ThreadPoolExecutor(max_workers = n_workers) as executor:
        futures = {executor.submit(process_audio_file, f) for f in file_gen}
        for future in concurrent.futures.as_completed(futures):
            pkg = future.result()
            process_data(*pkg)


if __name__ == "__main__":

    print(f"Program to process audio files has started.")

    t_start = time.perf_counter()

    main()

    t_stop = time.perf_counter()
    print(f"Done! Processing completed in {t_stop - t_start} seconds.")
Brougham answered 6/3, 2021 at 6:17 Comment(10)
Hi Mark, thank you for your answer. I tested it on 3 wav files and it finished in 55 seconds, even though those files totalled over 4 minutes of audio. Approving this answer as it satisfies both halves of the question.Feodora
Thanks! That was pretty fun. This was probably my second time using IBM Watson, first time for speech-to-text. Definitely an interesting platform that I might have to look into more.Brougham
Mark, yes, I prefer Watson to AWS and GCloud. It's a cost-friendly API too. Thanks for your input! My next mission is to improve the accuracy of the model. They do let you customize based on what your business needs are!Feodora
Hey Mark, I have a couple of questions (I can post this as a new question on SO if you like): 1) I am trying to get this script to work in a Shiny app and want it to only return a dataframe, so users have the option to download the csv instead of one being generated automatically. How could I change the script to do that? 2) If I did not want to transcribe multiple files, and just wanted to do one at a time like my original app, but wanted to add your speed enhancements so even a single file transcribes faster, what should I change in my original code? Truly appreciate your help.Feodora
Hi. I'm not super familiar with Shiny (the R library, right?), but if you can run a subprocess (command line) script from Shiny, you should be able to execute this script and make it available somehow. Possibly local storage? Wherever the script outputs the csv file to is where you'd direct the link.Brougham
For the second question, I think that might be limited to Watson's capabilities and the Internet connection. If it takes ~30 seconds to process a file, but you can do multiple files at once, then the total time should still be ~30 seconds. But, this answer might also help out: https://mcmap.net/q/1918831/-how-to-use-multiprocessing-to-loop-through-a-big-list-of-urlBrougham
Hey Mark, thanks for your response. I think I wasn't able to communicate my questions well. For the first question, let's forget about the R Shiny app for now. My question is: how would I return the output as a dataframe and not a csv? I just want the final output in dataframe format, without writing it to a csv. For my second question, it's not about the speed; it runs at a great speed already (thanks to you :). I meant: how could I add your speed enhancements to my original app, where I am not transcribing batches (multiple files), but one wav at a time?Feodora
Mark, also, please recommend what I can do to make my transcription more accurate.Feodora
Hi. I think that if you just change test4.to_csv(output_path, index = False) to return test4 in the process_data() function, then that should output the complete dataframe.Brougham
I might still not understand the second question. Do you mean just a more-efficient process_data() function? If so, there's some decent documentation on the Pandas website about enhancements and optimizations. Using Numba and eval() methods are pretty good starting points. Link -> pandas.pydata.org/pandas-docs/stable/user_guide/…Brougham

Have you tried running this script multiple times? You could write a wrapper that launches the script in subprocesses, something like this:

import subprocess
import sys

processes = []
for _ in range(5):
    processes.append(subprocess.Popen([sys.executable, "/path/to/script.py"]))

# now wait for them to finish
for process in processes:
    process.wait()
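To make the wrapper pick the files up itself, rather than launching the same hard-coded script a fixed number of times, you could pass each wav path as a command-line argument (a sketch; it assumes the transcription script is changed to read its input file from sys.argv[1] and name its csv after it):

```python
import glob
import subprocess
import sys

# One transcription process per wav file in the current folder;
# each child receives its wav path as a command-line argument.
processes = [
    subprocess.Popen([sys.executable, "/path/to/script.py", wav_path])
    for wav_path in glob.glob("*.wav")
]

# now wait for them to finish
for process in processes:
    process.wait()
```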
Drawbar answered 3/3, 2021 at 16:48 Comment(1)
I can run this script multiple times, but I want to automate it so that it picks up wav files from a folder and processes them. Say I have 15 audio files in my folder C:\Python. I want an automated process that runs the script and produces 15 csvs, one for each file with its respective output. Right now this script works, but I have to run it manually for each wav file.Feodora

Seems like you want to find all .wav files in a directory and process each in turn.

import os

for filename in os.listdir(os.getcwd()):
    if filename.endswith('.wav'):
        with open(filename, 'rb') as audio_file:
            # your existing per-file transcription code goes here,
            # writing a csv named after `filename`
            ...


You could even extend it so that it keeps running and only processes new files.
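A minimal version of that watch loop, polling with a set of already-processed names (the folder path and the 10-second interval below are placeholders):

```python
import os
import time

def new_wavs(folder, seen):
    """Return paths of .wav files in `folder` not yet in `seen`,
    adding their names to `seen` so each file is handled only once."""
    fresh = []
    for filename in sorted(os.listdir(folder)):
        if filename.endswith(".wav") and filename not in seen:
            seen.add(filename)
            fresh.append(os.path.join(folder, filename))
    return fresh

# Keep running, handling each wav file as it appears:
# seen = set()
# while True:
#     for path in new_wavs(".", seen):
#         ...  # run the existing transcription code on `path`
#     time.sleep(10)
```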

Varus answered 4/3, 2021 at 10:30 Comment(5)
Hey chugts, excuse my elementary question, but where exactly inside my existing script do I put your code to test it?Feodora
line up the with open( with the with open( in your code.Varus
I tried this approach, but it still only gives one csv as output. I tested with 5 audio files and received only 1 csv. I should be receiving 5 csvs, with each wav's transcription output in its own csv.Feodora
Most likely because you are writing to the same csv file each time.Varus
Do you have any suggestions on improving the accuracy of transcription? I'm currently on the lite plan and this is a base model.Feodora

You can just try to turn your code into a function, scan for all files with the .wav extension in your current directory (using os, as previously mentioned, or glob), and call this function for every file. It would result in something like this:

####RUN THIS PART FIRST#########
import json
from os.path import join, dirname
from ibm_watson import SpeechToTextV1
from ibm_watson.websocket import RecognizeCallback, AudioSource
import threading
from ibm_cloud_sdk_core.authenticators import IAMAuthenticator
import pandas as pd
import glob

authenticator = IAMAuthenticator('xxyyzz')

service = SpeechToTextV1(authenticator=authenticator)
service.set_service_url('https://api.us-east.speech-to-text.watson.cloud.ibm.com')

models = service.list_models().get_result()
#print(json.dumps(models, indent=2))

model = service.get_model('en-US_BroadbandModel').get_result()
#print(json.dumps(model, indent=2))

def transcribe(infile, service):
    with open(infile, 'rb') as audio_file:
        output = service.recognize(
            audio=audio_file,
            speaker_labels=True,
            content_type='audio/wav',
            #timestamps=True,
            #word_confidence=True,
            inactivity_timeout=-1,
            model='en-US_NarrowbandModel',
            continuous=True).get_result()

    # get data to a csv
    df0 = pd.DataFrame([alt for res in output['results'] for alt in res['alternatives']])

    df1 = pd.DataFrame(output['speaker_labels'])

    list(df0.columns) 
    list(df1.columns) 
    df0 = df0.drop(["timestamps"], axis=1)
    df1 = df1.drop(["final"], axis=1)
    df1 = df1.drop(['confidence'],axis=1)
    test3 = pd.concat([df0, df1], axis=1)
    #sentiment
    transcript = test3['transcript']
    transcript = transcript.dropna()
    from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
    analyzer = SentimentIntensityAnalyzer()
    text = transcript
    scores = []
    for txt in text:
        vs = analyzer.polarity_scores(txt)
        scores.append(vs)
    data = pd.DataFrame(text, columns= ['Text'])
    data2 = pd.DataFrame(scores)
    final_dataset= pd.concat([data,data2], axis=1)
    test4 = pd.concat([test3,final_dataset], axis=1)
    test4 = test4.drop(['Text'],axis=1)
    test4.rename(columns={'neg':'Negative'}, 
                     inplace=True)
    test4.rename(columns={'pos':'Positive'}, 
                     inplace=True)
    test4.rename(columns={'neu':'Neutral'}, 
                     inplace=True)

    # This is the name of the output csv file
    test4.to_csv(infile[:-4] + ".csv")

for i in glob.glob("*.wav"):
    transcribe(i, service)
Daystar answered 6/3, 2021 at 0:22 Comment(7)
To make @hartville-zillow happy, you should update this to look in the same directory as __file__. I think you also need an * before .wav in the file search. And it would probably be a good idea to only replace .wav with .csv at the end of the file name when saving, rather than replacing everywhere in the filename (which will mess up filenames like "my.big.wave.file.wav").Vitellin
Oh yes the * was definitely missed by accident. Also, good suggestions! I will implement them.Daystar
Hi @Charbelabidaher, thanks for your answer. I tried running the script you posted and it ran without error. However, I do not see any output csv files in the directory I transcribed the wav files from; there is no output.Feodora
@hartvillezillow could you try running this edited code again?Daystar
Hey @Charbelabidaher, it works! Thank you so much. I marked it correct. Any idea on how to solve the second part of the question? Would really appreciate it. P.S. It took me a while to respond because it is taking that long to transcribe.Feodora
Thank you for accepting my answer! I will attempt to find ways to make it faster by sending requests in parallel, as suggested in other comments, though I am not sure how successful that will be.Daystar
@Charbelabidaher do you have any suggestions to improve accuracy of transcribed words?Feodora
