85 lines
3.3 KiB
Python
Raw Normal View History

# coding: utf-8
from __future__ import unicode_literals
2020-01-16 01:03:19 -05:00
import re
from .common import InfoExtractor
from ..compat import compat_str
2020-01-16 01:03:19 -05:00
from ..utils import (
try_get,
int_or_none
)
class ThisOldHouseIE(InfoExtractor):
    """Extractor for thisoldhouse.com watch / how-to / tv-episode pages.

    The page itself is only scraped for a media id and some episode
    metadata; the returned ``url_transparent`` result hands the actual
    media extraction over to the JWPlatform extractor.
    """

    _VALID_URL = r'https?://(?:www\.)?thisoldhouse\.com/(?:watch|how-to|tv-episode)/(?P<id>[^/?#]+)'
    _TESTS = [{
        'url': 'https://www.thisoldhouse.com/how-to/how-to-build-storage-bench',
        'md5': '568acf9ca25a639f0c4ff905826b662f',
        'info_dict': {
            'id': '2REGtUDQ',
            'ext': 'mp4',
            'title': 'How to Build a Storage Bench',
            'description': 'In the workshop, Tom Silva and Kevin O\'Connor build a storage bench for an entryway.',
            'timestamp': 1442548800,
            'upload_date': '20150918',
        }
    }, {
        'url': 'https://www.thisoldhouse.com/watch/taking-modern-back-to-future-brookline-mid-century-modern-house',
        'md5': '5bff4b17e959527066efba9371bb81ba',
        'info_dict': {
            'id': '8WrwQuEr',
            'ext': 'mp4',
            'title': 'Taking Modern Back to the Future | Brookline Mid-Century Modern House',
            'description': 'After months of hard work, the lackluster mid-century box is a modern marvel once again. Kevin, Tommy and Richard tour the home and review all the special features that went into this beautiful space Sunil and Neha can now call home.',
            'upload_date': '20190624',
            'timestamp': 1561397187,
            'season_number': 40,
            'episode_number': 26
        },
    }, {
        'url': 'https://www.thisoldhouse.com/watch/arlington-arts-crafts-arts-and-crafts-class-begins',
        'only_matching': True,
    }, {
        'url': 'https://www.thisoldhouse.com/tv-episode/ask-toh-shelf-rough-electric',
        'only_matching': True,
    }]

    def _find_media_id(self, page, display_id):
        """Return the media id embedded in *page*.

        The ``data-mid`` attribute and the ``inline-video-player-*`` element
        id are tried first.  When neither matches, the Drupal settings JSON
        is parsed and its ``jwplatform.video_id`` string is used, falling
        back to the first item obtained by iterating ``comScore``.
        """
        attribute_patterns = (
            r'data-mid=(["\'])(?P<id>(?:(?!\1).)+)\1',
            r'id=(["\'])inline-video-player-(?P<id>(?:(?!\1).)+)\1',
        )
        media_id = self._search_regex(
            attribute_patterns, page, 'video id', default=None, group='id')
        if media_id:
            return media_id

        # No default here: a page without the settings blob is a hard failure.
        settings_json = self._search_regex(
            r'jQuery\.extend\(Drupal\.settings\s*,\s*({.+?})\);',
            page, 'drupal settings')
        settings = self._parse_json(settings_json, display_id)

        media_id = try_get(
            settings, lambda s: s['jwplatform']['video_id'], compat_str)
        if media_id:
            return media_id
        return list(settings['comScore'])[0]

    def _real_extract(self, url):
        """Scrape the page and return a ``url_transparent`` JWPlatform result."""
        display_id = self._match_id(url)
        page = self._download_webpage(url, display_id)
        media_id = self._find_media_id(page, display_id)

        # All episode metadata is optional, hence default=None everywhere.
        series = self._search_regex(
            r'(?s)episode-breadcrumb.*?>.*?>(.*?)</a>',
            page, 'series name', default=None)
        season_text = self._search_regex(
            r'Season (\d+);', page, 'season number', default=None)
        episode_text = self._search_regex(
            r'Season \d+;[\s\S]*Ep\.(\d+)',
            page, 'episode number', default=None)

        # Drop the " TV" part from the scraped series name, if there is one.
        series = series.replace(' TV', '') if series else series

        result = {
            '_type': 'url_transparent',
            'id': media_id,
            'series': series,
            'season_number': int_or_none(season_text),
            'episode_number': int_or_none(episode_text),
            'url': 'jwplatform:' + media_id,
            'ie_key': 'JWPlatform',
        }
        return result