Skip to content

Commit 538ca89

Browse files
authored
Merge pull request #163 from pettarin/devel
Candidate v1.7.2
2 parents 29016f1 + f9cf672 commit 538ca89

17 files changed

+239
-152
lines changed

README.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
**aeneas** is a Python/C library and a set of tools to automagically synchronize audio and text (aka forced alignment).
44

55
* Version: 1.7.2
6-
* Date: 2017-??-??
6+
* Date: 2017-03-03
77
* Developed by: [ReadBeyond](http://www.readbeyond.it/)
88
* Lead Developer: [Alberto Pettarin](http://www.albertopettarin.it/)
99
* License: the GNU Affero General Public License Version 3 (AGPL v3)
@@ -316,7 +316,7 @@ No copy rights were harmed in the making of this project.
316316
317317
* **April 2016**: the Fruch Foundation kindly sponsored the development and documentation of v1.5.0
318318
319-
* **December 2016**: the [Centro Internazionale Del Libro Parlato "Adriano Sernagiotto"](http://www.libroparlato.org/) (Feltre, Italy) partially sponsored the development of v1.7.0
319+
* **December 2016**: the [Centro Internazionale Del Libro Parlato "Adriano Sernagiotto"](http://www.libroparlato.org/) (Feltre, Italy) partially sponsored the development of v1.7.0, v1.7.1, and v1.7.2
320320
321321
### Supporting
322322

README.rst

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ aeneas
55
synchronize audio and text (aka forced alignment).
66

77
- Version: 1.7.2
8-
- Date: 2017-??-??
8+
- Date: 2017-03-03
99
- Developed by: `ReadBeyond <http://www.readbeyond.it/>`__
1010
- Lead Developer: `Alberto Pettarin <http://www.albertopettarin.it/>`__
1111
- License: the GNU Affero General Public License Version 3 (AGPL v3)
@@ -359,7 +359,8 @@ Sponsors
359359

360360
- **December 2016**: the `Centro Internazionale Del Libro Parlato
361361
"Adriano Sernagiotto" <http://www.libroparlato.org/>`__ (Feltre,
362-
Italy) partially sponsored the development of v1.7.0
362+
Italy) partially sponsored the development of v1.7.0, v1.7.1, and
363+
v1.7.2
363364

364365
Supporting
365366
~~~~~~~~~~

aeneas/diagnostics.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -167,7 +167,7 @@ def check_tools(cls):
167167
"""
168168
try:
169169
from aeneas.tools.convert_syncmap import ConvertSyncMapCLI
170-
# disabling this check, as it requires the optional dependency pafy
170+
# disabling this check, as it requires the optional dependency youtube-dl
171171
# COMMENTED from aeneas.tools.download import DownloadCLI
172172
from aeneas.tools.execute_job import ExecuteJobCLI
173173
from aeneas.tools.execute_task import ExecuteTaskCLI

aeneas/downloader.py

Lines changed: 151 additions & 64 deletions
Original file line numberDiff line numberDiff line change
@@ -24,22 +24,36 @@
2424
"""
2525
This module contains the following classes:
2626
27+
* :class:`~aeneas.downloader.DownloadError`, which represents an error that occurred
28+
while downloading a Web resource.
2729
* :class:`~aeneas.downloader.Downloader`, which downloads files from various Web sources.
2830
29-
.. note:: This module requires Python modules ``youtube-dl`` and ``pafy`` (``pip install youtube-dl pafy``).
31+
.. note:: This module requires Python module ``youtube-dl`` (``pip install youtube-dl``).
3032
"""
3133

3234
from __future__ import absolute_import
3335
from __future__ import print_function
36+
import time
3437

3538
from aeneas.logger import Loggable
3639
from aeneas.runtimeconfiguration import RuntimeConfiguration
3740
import aeneas.globalfunctions as gf
3841

3942

43+
class DownloadError(Exception):
44+
"""
45+
Error raised when a given URL is not valid or
46+
it cannot be downloaded because of temporary
47+
network issues.
48+
"""
49+
pass
50+
51+
4052
class Downloader(Loggable):
4153
"""
4254
Download files from various Web sources.
55+
At the moment, only YouTube videos
56+
are officially supported.
4357
4458
:param rconf: a runtime configuration
4559
:type rconf: :class:`~aeneas.runtimeconfiguration.RuntimeConfiguration`
@@ -54,9 +68,8 @@ def audio_from_youtube(
5468
source_url,
5569
download=True,
5670
output_file_path=None,
57-
preferred_index=None,
71+
download_format=None,
5872
largest_audio=True,
59-
preferred_format=None
6073
):
6174
"""
6275
Download an audio stream from a YouTube video,
@@ -67,8 +80,8 @@ def audio_from_youtube(
6780
6881
Otherwise, download the audio stream best matching
6982
the provided parameters, as follows.
70-
If ``preferred_index`` is not ``None``,
71-
download the audio stream at that index.
83+
If ``download_format`` is not ``None``,
84+
download the audio stream with the specified format.
7285
If ``largest_audio`` is ``True``,
7386
download the largest audiostream;
7487
otherwise, download the smallest audiostream.
@@ -80,76 +93,150 @@ def audio_from_youtube(
8093
8194
:param string source_url: the URL of the YouTube video
8295
:param bool download: if ``True``, download the audio stream
83-
best matching ``preferred_index`` or ``preferred_format``
84-
and ``largest_audio``;
96+
best matching ``download_format``
97+
and ``largest_audio``;
8598
if ``False``, return the list of available audio streams
8699
:param string output_file_path: the path where the downloaded audio should be saved;
87100
if ``None``, create a temporary file
88-
:param int preferred_index: preferably download this audio stream
101+
:param string download_format: download the audio stream with the given format
89102
:param bool largest_audio: if ``True``, download the largest audio stream available;
90103
if ``False``, download the smallest one.
91-
:param string preferred_format: preferably download this audio format
92-
:rtype: string or list of pafy audio streams
93-
:raises: ImportError: if ``pafy`` is not installed
104+
:rtype: string or list of dict
105+
:raises: ImportError: if ``youtube-dl`` is not installed
94106
:raises: OSError: if ``output_file_path`` cannot be written
95-
:raises: ValueError: if ``source_url`` is not a valid YouTube URL
107+
:raises: :class:`~aeneas.downloader.DownloadError`: if ``source_url`` is not a valid YouTube URL
108+
or it cannot be downloaded e.g. for temporary
109+
network issues
96110
"""
97-
def select_audiostream(audiostreams):
98-
""" Select the audiostream best matching the given parameters. """
99-
if preferred_index is not None:
100-
if preferred_index in range(len(audiostreams)):
101-
self.log([u"Selecting audiostream with index %d", preferred_index])
102-
return audiostreams[preferred_index]
103-
else:
104-
self.log_warn([u"Audio stream index '%d' not allowed", preferred_index])
105-
self.log_warn(u"Ignoring the requested audio stream index")
106-
# selecting by preferred format
107-
streams = audiostreams
108-
if preferred_format is not None:
109-
self.log([u"Selecting audiostreams by preferred format %s", preferred_format])
110-
streams = [audiostream for audiostream in streams if audiostream.extension == preferred_format]
111-
if len(streams) < 1:
112-
self.log([u"No audiostream with preferred format %s", preferred_format])
113-
streams = audiostreams
114-
# sort by size
115-
streams = sorted([(audio.get_filesize(), audio) for audio in streams])
116-
if largest_audio:
117-
self.log(u"Selecting largest audiostream")
118-
selected = streams[-1][1]
119-
else:
120-
self.log(u"Selecting smallest audiostream")
121-
selected = streams[0][1]
111+
112+
def _list_audiostreams(self, source_url):
113+
"""
114+
Return a list of dicts, each describing
115+
an available audiostream for the given ``source_url``.
116+
"""
117+
self.log(u"Getting audiostreams...")
118+
audiostreams = []
119+
options = {
120+
"download": False,
121+
"quiet": True,
122+
"skip_download": True,
123+
}
124+
with youtube_dl.YoutubeDL(options) as ydl:
125+
info = ydl.extract_info(source_url, download=False)
126+
audio_formats = [f for f in info["formats"] if f["vcodec"] == "none" and f["acodec"] != "none"]
127+
for a in audio_formats:
128+
audiostreams.append({
129+
"format": a["format"].split(" ")[0],
130+
"filesize": a["filesize"],
131+
"ext": a["ext"],
132+
"abr": a["abr"]
133+
})
134+
self.log(u"Getting audiostreams... done")
135+
return audiostreams
136+
137+
def _select_audiostream(self, audiostreams, download_format=None, largest_audio=False):
138+
"""
139+
Select the best-matching audiostream:
140+
if a ``download_format`` is given, use it,
141+
otherwise act according to ``largest_audio``.
142+
If ``download_format`` does not match any
143+
of the available audiostreams, then just act
144+
according to ``largest_audio``.
145+
"""
146+
self.log(u"Selecting best-matching audiostream...")
147+
selected = None
148+
if download_format is not None:
149+
matching = [a for a in audiostreams if a["format"] == download_format]
150+
if len(matching) > 0:
151+
selected = matching[0]
152+
if selected is None:
153+
sa = sorted(audiostreams, key=lambda x: x["filesize"])
154+
selected = sa[-1] if largest_audio else sa[0]
155+
self.log(u"Selecting best-matching audiostream... done")
122156
return selected
123157

124-
try:
125-
import pafy
126-
except ImportError as exc:
127-
self.log_exc(u"Python module pafy is not installed", exc, True, ImportError)
158+
def _compose_output_file_path(self, extension, output_file_path=None):
159+
"""
160+
If ``output_file_path`` is given, use it.
161+
Otherwise (``output_file_path`` is ``None``),
162+
create a temporary file with the correct extension.
163+
"""
164+
self.log(u"Determining output file path...")
165+
if output_file_path is None:
166+
self.log(u"output_file_path is None: creating temp file")
167+
handler, output_file_path = gf.tmp_file(
168+
root=self.rconf[RuntimeConfiguration.TMP_PATH],
169+
suffix=(".%s" % extension)
170+
)
171+
gf.delete_file(handler, output_file_path)
172+
else:
173+
self.log(u"output_file_path is not None: cheking that file can be written")
174+
if not gf.file_can_be_written(output_file_path):
175+
self.log_exc(u"Path '%s' cannot be written. Wrong permissions?" % (output_file_path), None, True, OSError)
176+
self.log(u"Determining output file path... done")
177+
self.log([u"Output file path is '%s'", output_file_path])
178+
return output_file_path
179+
180+
def _download_audiostream(self, source_url, fmt, output_path):
181+
self.log(u"Downloading audiostream...")
182+
options = {
183+
"download": True,
184+
"format": fmt,
185+
"outtmpl": output_path,
186+
"quiet": True,
187+
"skip_download": False,
188+
}
189+
with youtube_dl.YoutubeDL(options) as ydl:
190+
ydl.download([source_url])
191+
self.log(u"Downloading audiostream... done")
128192

129193
try:
130-
video = pafy.new(source_url)
131-
except (IOError, OSError, ValueError) as exc:
132-
self.log_exc(u"The specified source URL '%s' is not a valid YouTube URL or you are offline" % (source_url), exc, True, ValueError)
194+
import youtube_dl
195+
except ImportError as exc:
196+
self.log_exc(u"Python module youtube-dl is not installed", exc, True, ImportError)
197+
198+
# retry parameters
199+
sleep_delay = self.rconf[RuntimeConfiguration.DOWNLOADER_SLEEP]
200+
attempts = self.rconf[RuntimeConfiguration.DOWNLOADER_RETRY_ATTEMPTS]
201+
self.log([u"Sleep delay: %.3f", sleep_delay])
202+
self.log([u"Retry attempts: %d", attempts])
203+
204+
# get audiostreams
205+
att = attempts
206+
while att > 0:
207+
self.log(u"Sleeping to throttle API usage...")
208+
time.sleep(sleep_delay)
209+
self.log(u"Sleeping to throttle API usage... done")
210+
try:
211+
audiostreams = _list_audiostreams(self, source_url)
212+
break
213+
except:
214+
self.log_warn(u"Unable to list audio streams, retry")
215+
att -= 1
216+
if att <= 0:
217+
self.log_exc(u"All downloader requests failed: wrong URL or you are offline", None, True, DownloadError)
133218

134219
if not download:
135-
self.log(u"Returning the list of audio streams")
136-
return video.audiostreams
137-
138-
output_path = output_file_path
139-
if output_file_path is None:
140-
self.log(u"output_path is None: creating temp file")
141-
handler, output_path = gf.tmp_file(root=self.rconf[RuntimeConfiguration.TMP_PATH])
142-
else:
143-
if not gf.file_can_be_written(output_path):
144-
self.log_exc(u"Path '%s' cannot be written. Wrong permissions?" % (output_path), None, True, OSError)
145-
146-
audiostream = select_audiostream(video.audiostreams)
147-
if output_file_path is None:
148-
gf.delete_file(handler, output_path)
149-
output_path += "." + audiostream.extension
150-
151-
self.log([u"output_path is '%s'", output_path])
152-
self.log(u"Downloading...")
153-
audiostream.download(filepath=output_path, quiet=True)
154-
self.log(u"Downloading... done")
220+
self.log(u"Returning list of audiostreams")
221+
return audiostreams
222+
223+
# download the best-matching audiostream
224+
if len(audiostreams) == 0:
225+
self.log_exc(u"No audiostreams available for the provided URL", None, True, OSError)
226+
audiostream = _select_audiostream(self, audiostreams, download_format, largest_audio)
227+
output_path = _compose_output_file_path(self, audiostream["ext"], output_file_path)
228+
att = attempts
229+
while att > 0:
230+
self.log(u"Sleeping to throttle API usage...")
231+
time.sleep(sleep_delay)
232+
self.log(u"Sleeping to throttle API usage... done")
233+
try:
234+
_download_audiostream(self, source_url, audiostream["format"], output_path)
235+
break
236+
except:
237+
self.log_warn(u"Unable to download audio streams, retry")
238+
att -= 1
239+
if att <= 0:
240+
self.log_exc(u"All downloader requests failed: wrong URL or you are offline", None, True, DownloadError)
241+
155242
return output_path

aeneas/runtimeconfiguration.py

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -164,6 +164,29 @@ class RuntimeConfiguration(Configuration):
164164
.. versionadded:: 1.5.1
165165
"""
166166

167+
DOWNLOADER_SLEEP = "downloader_sleep"
168+
"""
169+
Wait this number of seconds before the next HTTP POST request
170+
of the ``Downloader``.
171+
This parameter can be used to throttle the HTTP usage.
172+
It cannot be a negative value.
173+
174+
Default: ``1.000``.
175+
176+
.. versionadded:: 1.7.2
177+
"""
178+
179+
DOWNLOADER_RETRY_ATTEMPTS = "downloader_retry_attempts"
180+
"""
181+
Retry an HTTP POST request generated by the ``Downloader``
182+
for this number of times before giving up.
183+
It must be an integer greater than zero.
184+
185+
Default: ``5``.
186+
187+
.. versionadded:: 1.7.2
188+
"""
189+
167190
DTW_ALGORITHM = "dtw_algorithm"
168191
"""
169192
DTW aligner algorithm.
@@ -906,6 +929,9 @@ class RuntimeConfiguration(Configuration):
906929
(DTW_ALGORITHM, ("stripe", None, [], u"DTW algorithm (stripe, exact)")),
907930
(DTW_MARGIN, ("60.000", TimeValue, [], u"DTW margin, in s")),
908931

932+
(DOWNLOADER_SLEEP, ("1.000", TimeValue, [], u"sleep between Downloader calls, in s")),
933+
(DOWNLOADER_RETRY_ATTEMPTS, (5, int, [], u"number of retries for a failed Downloader call")),
934+
909935
(FFMPEG_PATH, ("ffmpeg", None, [], u"path to ffmpeg executable")), # or a full path like "/usr/bin/ffmpeg"
910936
(FFMPEG_SAMPLE_RATE, (16000, int, [], u"ffmpeg sample rate")),
911937

0 commit comments

Comments
 (0)