This repository was archived by the owner on Feb 27, 2024. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathtranscribe-items.py
executable file
·177 lines (146 loc) · 5.66 KB
/
transcribe-items.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
#!/usr/bin/env python -u
"""Queue files listed in a manifest for AWS Transcribe"""
import argparse
import fileinput
import os
import sys
import tempfile
import time
from urllib.parse import urljoin, urlparse
import boto3
import botocore
import requests
# Media container formats that AWS Transcribe accepts as input.
SUPPORTED_FILE_TYPES = ("mp3", "mp4", "wav", "flac")
# Maps the manifest's human-readable language name to an AWS Transcribe
# language code.
SUPPORTED_LANGUAGES = {"english": "en-US", "spanish": "es-US"}
# Module-level AWS clients shared by every function below; region and
# credentials come from the standard boto3 environment/config resolution.
s3 = boto3.client("s3")
transcribe = boto3.client("transcribe")
def transcribe_item(
    job_name, s3_url, *, media_format="wav", language_code="en-US", vocabulary=None
):
    """Start an AWS Transcribe job for a media file already stored in S3.

    Returns the raw ``start_transcription_job`` response. When *vocabulary*
    is provided it is forwarded as the job's custom ``VocabularyName``;
    otherwise an empty Settings mapping is sent.
    """
    job_settings = {"VocabularyName": vocabulary} if vocabulary else {}
    return transcribe.start_transcription_job(
        TranscriptionJobName=job_name,
        Media={"MediaFileUri": s3_url},
        MediaFormat=media_format,
        LanguageCode=language_code,
        Settings=job_settings,
    )
def upload_audio_to_s3(audio_url, bucket_name, dest_key):
    """Download *audio_url* to a temp file and upload it to s3://bucket/dest_key.

    Idempotent: if *dest_key* already exists in the bucket, the upload is
    skipped with a note on stderr. Any S3 error other than a 404 from the
    existence probe is re-raised, as is any HTTP error from the download.
    """
    try:
        s3.head_object(Bucket=bucket_name, Key=dest_key)
    except botocore.exceptions.ClientError as e:
        # head_object reports a missing key as error code "404"; anything
        # else (permissions, throttling, ...) is a genuine failure.
        if e.response["Error"]["Code"] != "404":
            raise
    else:
        print(f"{dest_key} already exists; won't repeat upload", file=sys.stderr)
        return
    with tempfile.NamedTemporaryFile(mode="w+b") as local_temp:
        # Use the response as a context manager so the streamed connection is
        # released deterministically (the original leaked it), and set a
        # (connect, read) timeout so a stalled server cannot hang us forever.
        with requests.get(
            audio_url, stream=True, allow_redirects=True, timeout=(10, 60)
        ) as r:
            r.raise_for_status()
            # Fall back to a generic type instead of crashing with KeyError
            # when the server omits the Content-Type header.
            content_type = r.headers.get("Content-Type", "application/octet-stream")
            for chunk in r.iter_content(chunk_size=256 * 1024):
                local_temp.write(chunk)
        local_temp.flush()
        local_temp.seek(0)
        s3.upload_fileobj(
            local_temp, bucket_name, dest_key, ExtraArgs={"ContentType": content_type}
        )
def main(bucket_name, files, *, vocabulary=None):
    """Queue each item listed in the tab-separated manifest *files* for Transcribe.

    Each manifest line must contain exactly six tab-separated fields:
    item_id, language, title, url, media_master_url, media_stream_url.
    *bucket_name* is the S3 bucket used for staging media that is not
    already on S3; *vocabulary* is an optional Transcribe custom vocabulary.
    Progress and skip reasons are printed to stdout/stderr.
    """
    # fileinput chains all the named manifest files (or stdin for "-").
    for line in fileinput.input(files):
        # Exactly 5 tabs => exactly 6 fields; anything else is malformed.
        if line.count("\t") != 5:
            print("Skipping malformed line:", repr(line), file=sys.stderr)
            continue
        (
            item_id,
            language,
            title,
            url,
            media_master_url,
            media_stream_url,
        ) = map(str.strip, line.split("\t", 5))
        # FIXME: add some configuration management
        # A completed transcript is assumed to have been saved as
        # results/<item_id>.json by a separate download step.
        if os.path.exists(os.path.join("results", "%s.json" % item_id)):
            print("Skipping completed item", item_id)
            continue
        if language not in SUPPORTED_LANGUAGES:
            print(
                "Transcribe currently does not support %s" % language, file=sys.stderr
            )
            continue
        lang = SUPPORTED_LANGUAGES[language]
        # Derive the media format from the URL path's file extension.
        parsed_audio_url = urlparse(media_master_url)
        _, file_ext = os.path.splitext(parsed_audio_url.path)
        file_ext = file_ext.strip(".")
        if file_ext not in SUPPORTED_FILE_TYPES:
            print("Transcribe does not support %s files" % file_ext, file=sys.stderr)
            continue
        try:
            # Jobs are keyed by item_id; if one already exists (in any
            # state), don't submit a duplicate.
            existing_job = transcribe.get_transcription_job(
                TranscriptionJobName=item_id
            )
            print(
                "Not reprocessing existing job %s: %s"
                % (item_id, existing_job["TranscriptionJob"]["TranscriptionJobStatus"])
            )
            continue
        except botocore.exceptions.ClientError:
            # Unfortunately the AWS response doesn't give a better option to
            # disambiguate error conditions than scraping error messages but
            # we'll punt any other error to the start job error path
            pass
        if media_master_url.startswith("s3://"):
            # Already on S3: Transcribe can read it in place.
            s3_url = media_master_url
        else:
            # Must be staged into our bucket first, under <item_id>.<ext>.
            s3_path = f"{item_id}.{file_ext}"
            if not bucket_name:
                print(
                    f"{item_id} needs to be uploaded to S3 but the bucket name was not"
                    " provided. Perhaps you need to add --bucket?",
                    file=sys.stderr,
                )
                continue
            print(f"Uploading {item_id} “{title}” to {s3_path}…")
            try:
                upload_audio_to_s3(media_master_url, bucket_name, s3_path)
            except Exception as exc:
                # Best-effort per item: report the failure and move on.
                print(f"Unable to upload {media_master_url}: {exc}", file=sys.stderr)
                continue
            # Build an HTTPS URL to the staged object off the S3 endpoint.
            s3_url = urljoin(s3.meta.endpoint_url, "%s/%s" % (bucket_name, s3_path))
        print(f"Transcribing {item_id} from {s3_url}")
        # Retry submission with a linear backoff (1s, 6s, 11s, ...) for up
        # to 30 attempts; falls through silently if all attempts are
        # rate-limited.
        for i in range(0, 30):
            try:
                transcribe_item(
                    item_id,
                    s3_url,
                    media_format=file_ext,
                    language_code=lang,
                    vocabulary=vocabulary,
                )
                break
            except botocore.exceptions.ClientError as exc:
                # NOTE(review): botocore usually puts the exception *name*
                # (e.g. "LimitExceededException") in Error.Code, not the
                # HTTP status "400" — confirm this branch actually matches
                # throttling responses.
                if exc.response["Error"]["Code"] == "400":
                    print("Rate-limiting…")
                    time.sleep(1 + (5 * i))
                    continue
                else:
                    print(
                        f"Unexpected API response for {item_id}: {exc}", file=sys.stderr
                    )
                    break
            except Exception as exc:
                print(f"Unable to transcribe {item_id}: {exc}", file=sys.stderr)
                continue
        # Small fixed delay between submissions to stay under API limits.
        time.sleep(
            0.5
        )  # FIXME: develop a more comprehensive solution for AWS throttling
if __name__ == "__main__":
    # CLI entry point: queue every manifest file named on the command line.
    arg_parser = argparse.ArgumentParser(description=__doc__.strip())
    arg_parser.add_argument("--bucket", help="S3 bucket name which Transcribe can access")
    arg_parser.add_argument(
        "--vocabulary", help="Optional custom vocabulary for Transcribe to use"
    )
    arg_parser.add_argument("files", nargs="+")
    cli_args = arg_parser.parse_args()
    main(cli_args.bucket, cli_args.files, vocabulary=cli_args.vocabulary)