First, you might need to install these packages:
! pip install -U mock pytube openai-whisper
Download a YouTube video and extract its audio:
import re
import mock
from pytube.cipher import get_throttling_function_code
def patched_throttling_plan(js: str) -> list:
    """Collect the throttling transform steps found in ``js``.

    Drop-in replacement for ``pytube.cipher.get_throttling_plan``, adapted
    from https://github.com/pytube/pytube/issues/1498. Rather than isolating
    the transform-plan object first, it scans the whole of ``js`` for step
    calls.

    Args:
        js: JavaScript source handed in by pytube (presumably the player
            script; verify against the caller).

    Returns:
        A list with one tuple of digit strings per step, in order of
        appearance: ``(x, y)`` for ``c[x](c[y])`` and ``(x, y, z)`` for
        ``c[x](c[y],c[z])``.
    """
    # The return value is not needed, but the call is kept from the original
    # patch so that any error it raises for this ``js`` still surfaces here.
    # NOTE(review): probably safe to drop entirely -- confirm against pytube.
    get_throttling_function_code(js)

    # Steps are either c[x](c[y]) or c[x](c[y],c[z]).
    step_regex = re.compile(r"c\[(\d+)\]\(c\[(\d+)\](,c(\[(\d+)\]))?\)")

    # findall() yields one 5-tuple per match: groups 1 and 2 are x and y,
    # groups 3 and 4 are wrappers around the optional third index, and
    # group 5 is z itself (an empty string when the step has two indices).
    return [
        (x, y, z) if z != '' else (x, y)
        for x, y, _, _, z in step_regex.findall(js)
    ]
# Everything that touches pytube runs inside the patch context so that the
# replacement throttling-plan function is the one in effect throughout.
with mock.patch('pytube.cipher.get_throttling_plan', patched_throttling_plan):
    from pytube import YouTube

    url = 'https://www.youtube.com/watch?v=ZBVrPWwSlRM'
    video = YouTube(url)

    # Keep only audio-only mp4 streams and take the first one listed.
    audio_only_mp4 = video.streams.filter(only_audio=True, file_extension='mp4')
    audio = audio_only_mp4[0]

    # Saved under a fixed name; the later cells read this same file.
    audio.download(filename='team-rocket.mp4')
If you would like to play it in Jupyter to make sure the audio is correct:
# Notebook-only sanity check: render an audio player for the downloaded file
# so the clip can be checked by ear before transcribing it.
from IPython.display import Audio, display
# autoplay=True is passed so the player is asked to start without a click.
display(Audio('team-rocket.mp4', autoplay=True))
Transcribe the audio to text
# Plain openai-whisper pass: transcribe the downloaded file and print the text.
import whisper
# "base" names the Whisper model to load.
model = whisper.load_model("base")
# transcribe() is given the path of the audio file saved earlier.
result = model.transcribe('team-rocket.mp4')
# Only the full transcript, stored under the "text" key, is printed here.
print(result["text"])
[out]:
Prepare for trouble! Make it double! Super tip the world from devastation! To unite all peoples within our nation! To denounce the evils of truth and love! To extend our reach to the stars above! Jesse! James! Team Rocket blast off at the speed of life! So then to now, we're prepared to fight! Be out! That's right! Let's...
Note: The ASR output is definitely not perfect, but it is very much usable as a first draft that requires some manual post-editing. Please DO NOT submit the subs directly to your fav fan-subs forum; they will surely fail the moderators' QC.
But how about the timestamps?
Then we need a little more sophistication than crudely pulling the audio and running Automatic Speech Recognition (ASR) on it.
Here's a good article: https://blog.searce.com/generate-srt-file-subtitles-using-google-clouds-speech-to-text-api-402b2f1da3bd
And also we are blessed in today's age of AI... Tada: https://github.com/linto-ai/whisper-timestamped
! pip install -U git+https://github.com/linto-ai/whisper-timestamped
Then in code:
# whisper_timestamped is bound to the name `whisper` here, so from this point
# on `whisper` no longer refers to the plain openai-whisper module.
import whisper_timestamped as whisper
# `audio` and `model` are rebound too; the audio is loaded explicitly this time.
audio = whisper.load_audio('team-rocket.mp4')
model = whisper.load_model("base")
# Call shape differs from the plain-Whisper cell: transcribe is a module-level
# function taking the model and the loaded audio, with the language fixed to "en".
result = whisper.transcribe(model, audio, language="en")
import json
# Pretty-print the whole result; it carries "text", "segments" (each with
# start/end times and a per-word "words" list) and "language".
# ensure_ascii=False leaves non-ASCII characters unescaped in the dump.
print(json.dumps(result, indent = 2, ensure_ascii = False))
[out]:
{
"text": " Prepare for trouble! Make it double! Super tip the world from devastation! To unite all peoples within our nation! To denounce the evils of truth and love! To extend our reach to the stars above! Jesse! James! Team Rocket blast off at the speed of life! So then to now, we're prepared to fight! Be out! That's right! Let's...",
"segments": [
{
"id": 0,
"seek": 0,
"start": 0.22,
"end": 1.66,
"text": " Prepare for trouble!",
"tokens": [
50364,
29689,
337,
5253,
0,
50464
],
"temperature": 0.0,
"avg_logprob": -0.24259458803663067,
"compression_ratio": 1.5330188679245282,
"no_speech_prob": 0.7252825498580933,
"confidence": 0.873,
"words": [
{
"text": "Prepare",
"start": 0.22,
"end": 0.68,
"confidence": 0.726
},
{
"text": "for",
"start": 0.68,
"end": 1.1,
"confidence": 0.982
},
{
"text": "trouble!",
"start": 1.1,
"end": 1.66,
"confidence": 0.933
}
]
},
{
"id": 1,
"seek": 0,
"start": 2.48,
"end": 3.72,
"text": " Make it double!",
"tokens": [
50464,
4387,
309,
3834,
0,
50564
],
"temperature": 0.0,
"avg_logprob": -0.24259458803663067,
"compression_ratio": 1.5330188679245282,
"no_speech_prob": 0.7252825498580933,
"confidence": 0.878,
"words": [
{
"text": "Make",
"start": 2.48,
"end": 2.84,
"confidence": 0.979
},
{
"text": "it",
"start": 2.84,
"end": 3.12,
"confidence": 0.995
},
{
"text": "double!",
"start": 3.12,
"end": 3.72,
"confidence": 0.694
}
]
},
{
"id": 2,
"seek": 0,
"start": 4.2,
"end": 6.06,
"text": " Super tip the world from devastation!",
"tokens": [
50564,
4548,
4125,
264,
1002,
490,
13959,
399,
0,
50664
],
"temperature": 0.0,
"avg_logprob": -0.24259458803663067,
"compression_ratio": 1.5330188679245282,
"no_speech_prob": 0.7252825498580933,
"confidence": 0.709,
"words": [
{
"text": "Super",
"start": 4.2,
"end": 4.44,
"confidence": 0.728
},
{
"text": "tip",
"start": 4.44,
"end": 4.66,
"confidence": 0.195
},
{
"text": "the",
"start": 4.66,
"end": 4.86,
"confidence": 0.73
},
{
"text": "world",
"start": 4.86,
"end": 5.12,
"confidence": 0.913
},
{
"text": "from",
"start": 5.12,
"end": 5.36,
"confidence": 0.966
},
{
"text": "devastation!",
"start": 5.36,
"end": 6.06,
"confidence": 0.991
}
]
},
{
"id": 3,
"seek": 0,
"start": 6.4,
"end": 8.78,
"text": " To unite all peoples within our nation!",
"tokens": [
50664,
1407,
29320,
439,
16915,
1951,
527,
4790,
0,
50814
],
"temperature": 0.0,
"avg_logprob": -0.24259458803663067,
"compression_ratio": 1.5330188679245282,
"no_speech_prob": 0.7252825498580933,
"confidence": 0.943,
"words": [
{
"text": "To",
"start": 6.4,
"end": 6.52,
"confidence": 0.772
},
{
"text": "unite",
"start": 6.52,
"end": 6.88,
"confidence": 0.991
},
{
"text": "all",
"start": 6.88,
"end": 7.22,
"confidence": 0.992
},
{
"text": "peoples",
"start": 7.22,
"end": 7.64,
"confidence": 0.941
},
{
"text": "within",
"start": 7.64,
"end": 8.08,
"confidence": 0.97
},
{
"text": "our",
"start": 8.08,
"end": 8.28,
"confidence": 0.995
},
{
"text": "nation!",
"start": 8.28,
"end": 8.78,
"confidence": 0.96
}
]
},
{
"id": 4,
"seek": 0,
"start": 9.26,
"end": 11.26,
"text": " To denounce the evils of truth and love!",
"tokens": [
50814,
1407,
1441,
7826,
264,
1073,
4174,
295,
3494,
293,
959,
0,
50914
],
"temperature": 0.0,
"avg_logprob": -0.24259458803663067,
"compression_ratio": 1.5330188679245282,
"no_speech_prob": 0.7252825498580933,
"confidence": 0.883,
"words": [
{
"text": "To",
"start": 9.26,
"end": 9.44,
"confidence": 0.684
},
{
"text": "denounce",
"start": 9.44,
"end": 9.96,
"confidence": 0.739
},
{
"text": "the",
"start": 9.96,
"end": 10.14,
"confidence": 0.988
},
{
"text": "evils",
"start": 10.14,
"end": 10.44,
"confidence": 0.942
},
{
"text": "of",
"start": 10.44,
"end": 10.58,
"confidence": 0.995
},
{
"text": "truth",
"start": 10.58,
"end": 10.8,
"confidence": 0.906
},
{
"text": "and",
"start": 10.8,
"end": 11.02,
"confidence": 0.986
},
{
"text": "love!",
"start": 11.02,
"end": 11.26,
"confidence": 0.987
}
]
},
{
"id": 5,
"seek": 0,
"start": 11.44,
"end": 13.22,
"text": " To extend our reach to the stars above!",
"tokens": [
50914,
1407,
10101,
527,
2524,
281,
264,
6105,
3673,
0,
51014
],
"temperature": 0.0,
"avg_logprob": -0.24259458803663067,
"compression_ratio": 1.5330188679245282,
"no_speech_prob": 0.7252825498580933,
"confidence": 0.992,
"words": [
{
"text": "To",
"start": 11.44,
"end": 11.52,
"confidence": 0.993
},
{
"text": "extend",
"start": 11.52,
"end": 11.88,
"confidence": 0.996
},
{
"text": "our",
"start": 11.88,
"end": 12.1,
"confidence": 0.989
},
{
"text": "reach",
"start": 12.1,
"end": 12.32,
"confidence": 0.988
},
{
"text": "to",
"start": 12.32,
"end": 12.52,
"confidence": 0.997
},
{
"text": "the",
"start": 12.52,
"end": 12.62,
"confidence": 0.993
},
{
"text": "stars",
"start": 12.62,
"end": 12.86,
"confidence": 0.988
},
{
"text": "above!",
"start": 12.86,
"end": 13.22,
"confidence": 0.993
}
]
},
{
"id": 6,
"seek": 0,
"start": 13.48,
"end": 14.16,
"text": " Jesse!",
"tokens": [
51014,
21895,
0,
51064
],
"temperature": 0.0,
"avg_logprob": -0.24259458803663067,
"compression_ratio": 1.5330188679245282,
"no_speech_prob": 0.7252825498580933,
"confidence": 0.211,
"words": [
{
"text": "Jesse!",
"start": 13.48,
"end": 14.16,
"confidence": 0.211
}
]
},
{
"id": 7,
"seek": 0,
"start": 14.48,
"end": 15.22,
"text": " James!",
"tokens": [
51064,
5678,
0,
51114
],
"temperature": 0.0,
"avg_logprob": -0.24259458803663067,
"compression_ratio": 1.5330188679245282,
"no_speech_prob": 0.7252825498580933,
"confidence": 0.874,
"words": [
{
"text": "James!",
"start": 14.48,
"end": 15.22,
"confidence": 0.874
}
]
},
{
"id": 8,
"seek": 0,
"start": 15.48,
"end": 18.42,
"text": " Team Rocket blast off at the speed of life!",
"tokens": [
51114,
7606,
29651,
12035,
766,
412,
264,
3073,
295,
993,
0,
51264
],
"temperature": 0.0,
"avg_logprob": -0.24259458803663067,
"compression_ratio": 1.5330188679245282,
"no_speech_prob": 0.7252825498580933,
"confidence": 0.711,
"words": [
{
"text": "Team",
"start": 15.48,
"end": 16.22,
"confidence": 0.147
},
{
"text": "Rocket",
"start": 16.22,
"end": 16.66,
"confidence": 0.919
},
{
"text": "blast",
"start": 16.66,
"end": 17.06,
"confidence": 0.774
},
{
"text": "off",
"start": 17.06,
"end": 17.36,
"confidence": 0.722
},
{
"text": "at",
"start": 17.36,
"end": 17.5,
"confidence": 0.946
},
{
"text": "the",
"start": 17.5,
"end": 17.62,
"confidence": 0.993
},
{
"text": "speed",
"start": 17.62,
"end": 17.96,
"confidence": 0.983
},
{
"text": "of",
"start": 17.96,
"end": 18.2,
"confidence": 0.998
},
{
"text": "life!",
"start": 18.2,
"end": 18.42,
"confidence": 0.667
}
]
},
{
"id": 9,
"seek": 0,
"start": 18.48,
"end": 20.96,
"text": " So then to now, we're prepared to fight!",
"tokens": [
51264,
407,
550,
281,
586,
11,
321,
434,
4927,
281,
2092,
0,
51414
],
"temperature": 0.0,
"avg_logprob": -0.24259458803663067,
"compression_ratio": 1.5330188679245282,
"no_speech_prob": 0.7252825498580933,
"confidence": 0.79,
"words": [
{
"text": "So",
"start": 18.48,
"end": 19.0,
"confidence": 0.753
},
{
"text": "then",
"start": 19.0,
"end": 19.18,
"confidence": 0.564
},
{
"text": "to",
"start": 19.18,
"end": 19.34,
"confidence": 0.829
},
{
"text": "now,",
"start": 19.34,
"end": 19.66,
"confidence": 0.984
},
{
"text": "we're",
"start": 19.9,
"end": 20.14,
"confidence": 0.597
},
{
"text": "prepared",
"start": 20.14,
"end": 20.3,
"confidence": 0.973
},
{
"text": "to",
"start": 20.3,
"end": 20.56,
"confidence": 0.998
},
{
"text": "fight!",
"start": 20.56,
"end": 20.96,
"confidence": 0.996
}
]
},
{
"id": 10,
"seek": 0,
"start": 21.1,
"end": 23.28,
"text": " Be out! That's right! Let's...",
"tokens": [
51414,
879,
484,
0,
663,
311,
558,
0,
961,
311,
485,
51514
],
"temperature": 0.0,
"avg_logprob": -0.24259458803663067,
"compression_ratio": 1.5330188679245282,
"no_speech_prob": 0.7252825498580933,
"confidence": 0.795,
"words": [
{
"text": "Be",
"start": 21.1,
"end": 21.28,
"confidence": 0.462
},
{
"text": "out!",
"start": 21.28,
"end": 21.74,
"confidence": 0.807
},
{
"text": "That's",
"start": 22.02,
"end": 22.36,
"confidence": 0.958
},
{
"text": "right!",
"start": 22.36,
"end": 22.64,
"confidence": 0.992
},
{
"text": "Let's...",
"start": 22.86,
"end": 23.28,
"confidence": 0.769
}
]
}
],
"language": "en"
}
You've come this far with fine-grained JSON and timestamps, so how about some code to convert the JSON to an .srt file?
# Print one subtitle cue per transcribed segment: index, timing line, text and
# a blank separator line.
# NOTE: this is a deliberately crude first pass. The hours and minutes are
# hard-coded to 00:00 and int() drops the fractional seconds, so it is only
# meaningful for clips shorter than one minute.
for cue_index, segment in enumerate(result['segments']):
    # After int() there is no '.' left in the string, so replace() changes nothing.
    start_field = str(int(segment['start'])).replace('.', ',')
    end_field = str(int(segment['end'])).replace('.', ',')
    print(
        cue_index,
        f"00:00:{start_field} --> 00:00:{end_field}",
        segment['text'].strip(),
        "",
        sep="\n",
    )
[out]:
0
00:00:0 --> 00:00:1
Prepare for trouble!
1
00:00:2 --> 00:00:3
Make it double!
2
00:00:4 --> 00:00:6
Super tip the world from devastation!
3
00:00:6 --> 00:00:8
To unite all peoples within our nation!
4
00:00:9 --> 00:00:11
To denounce the evils of truth and love!
5
00:00:11 --> 00:00:13
To extend our reach to the stars above!
6
00:00:13 --> 00:00:14
Jesse!
7
00:00:14 --> 00:00:15
James!
8
00:00:15 --> 00:00:18
Team Rocket blast off at the speed of life!
9
00:00:18 --> 00:00:20
So then to now, we're prepared to fight!
10
00:00:21 --> 00:00:23
Be out! That's right! Let's...
小等咧! (Wait a minute!), you've hard-coded the hours and minutes... What do I do for longer videos?
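I'm sure that, with the JSON output from whisper-timestamped,
you can easily figure out the conversion. Hint: `from datetime import timedelta; str(timedelta(seconds=float(start)))` gets you most of the way; SRT timestamps have the form `HH:MM:SS,mmm`, so you still need to zero-pad the hours, trim the fraction to milliseconds, and swap the decimal point for a comma.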
Have fun munging the data to the desired format you need!