第一步,安装 requests-html
- 升级 pip
pip install --upgrade pip
- 升级 urllib3
sudo python3 -m pip install urllib3 --upgrade
- 安装 requests-html
sudo python3 -m pip install requests-html
第 1.1 步,给我的项目安装 requests-html
- 修改 setup.py 文件,
增加
install_requires=[ 'requests-html', ],
- 修改 launch.json
增加
"pythonPath": "/usr/bin/python3"
- 命令行,安装
sudo python3 -m setup install
- python 文件中,使用
from requests_html import HTMLSession
第 2 步,继续使用 youtube-dl
- 新建一个信息提取类
class XxxIE(InfoExtractor):
- 建立匹配正则
_VALID_URL = r'https?://(?:www\.|m\.)?xxx\.com.+posts?.+'
对应源代码
启动后,
- 先走 YoutubeDL.py 文件的
def extract_info(self, url, download=True, ie_key=None, extra_info={}, process=True, force_generic_extractor=False):
    # ...
    # Walk the candidate extractors; any extractor whose suitable(url)
    # check fails is skipped.
    for ie in ies:
        if not ie.suitable(url):
            continue
        # ...
- 再走 extractor 文件夹下 common.py 文件的
@classmethod
def suitable(cls, url):
    # Compile _VALID_URL only once: the compiled pattern is stored on the
    # class itself (checked via cls.__dict__) as _VALID_URL_RE.
    if '_VALID_URL_RE' not in cls.__dict__:
        cls._VALID_URL_RE = re.compile(cls._VALID_URL)
    # ...
2.1 剩下的交给
class XxxIE(InfoExtractor):
- 先在 extractor 文件夹下的
extractors.py
中引用一下
- XxxIE 中下载爬取,即可
import json

from requests_html import HTML


class XxxIE(InfoExtractor):
    """Demo extractor that prints the article titles of xxx.com post pages.

    For the given URL and for its ``?page=2`` .. ``?page=19`` variants, the
    page is downloaded, the JSON text of the ``#js-initialData`` element is
    parsed, and the ``title`` of every entry under
    ``initialState -> entities -> articles`` is printed to stdout.
    """

    _GEO_COUNTRIES = ['CN']
    IE_NAME = 'xxx: blog'
    IE_DESC = 'wo qu'
    _VALID_URL = r'https?://(?:www\.|m\.)?xxx\.com.+posts?.+'
    _TEMPLATE_URL = '%s://www.xxx.com/%s/posts/%s/'
    _LIST_VIDEO_RE = r'<a[^>]+?href="(?P<url>/%s/sound/(?P<id>\d+)/?)"[^>]+?title="(?P<title>[^>]+)">'

    def _real_extract(self, url):
        """Print the article titles of ``url`` and of its pages 2-19.

        Args:
            url: Address of the first post-list page.

        Returns:
            An empty dict; this method only prints, it builds no info dict.
        """
        print("start ya yay ya")
        print("\n\n\n")

        # Page 1 is the URL as given; later pages are addressed as ?page=N.
        self.downloadX(url, 1)
        # NOTE(review): appending "?page=" assumes ``url`` carries no query
        # string of its own - confirm against the URLs actually passed in.
        for index in range(2, 20):
            self.downloadX(url + "?page=" + str(index), index)

        print("\n\n\n")
        return {}

    def downloadX(self, src, index):
        """Download one page and print the titles of the articles it lists.

        Args:
            src: URL of the page to fetch.
            index: Page number, used only in the printed heading.

        Raises:
            KeyError: If the embedded JSON lacks the
                ``initialState -> entities -> articles`` path.
        """
        # Placeholder id; it is only passed to _download_webpage and used in
        # the progress note below.
        audio_id = 123456
        webpage = self._download_webpage(
            src, audio_id,
            note='Download sound page for %s' % audio_id,
            errnote='Unable to get sound page')

        elements = HTML(html=webpage).find('#js-initialData')
        if not elements:
            # Without this guard, indexing the empty result list raised an
            # IndexError and aborted the remaining pages.
            print("page: " + str(index) + " : no #js-initialData element found")
            return

        initial_data = json.loads(elements[0].text)
        articles = initial_data['initialState']['entities']['articles']

        print("page: " + str(index) + " : ")
        # Only the article records are needed, not their keys.
        for article in articles.values():
            print(article.get('title'))
        print("\n")