From 2ad5a9d621614cbdd8fa5c7eab64b67557e86655 Mon Sep 17 00:00:00 2001 From: Yung-Chung Ku Date: Mon, 19 Aug 2019 16:37:35 +0800 Subject: [PATCH] Update rssfdw.py 1. Skip SSL certificate verification when connecting to HTTPS RSS feeds (this replaces the process-wide default HTTPS context) 2. Send a browser-like User-Agent header to satisfy servers that check for one 3. Change the default item_root from 'item' to 'entry' --- python/multicorn/rssfdw.py | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/python/multicorn/rssfdw.py b/python/multicorn/rssfdw.py index ca3bf4517..0305b0e7c 100644 --- a/python/multicorn/rssfdw.py +++ b/python/multicorn/rssfdw.py @@ -83,13 +83,13 @@ from datetime import datetime, timedelta from lxml import etree try: - from urllib.request import urlopen + from urllib.request import urlopen, Request except ImportError: - from urllib import urlopen + from urllib import urlopen, Request from logging import ERROR, WARNING from multicorn.utils import log_to_postgres import json - +import ssl def element_to_dict(element): """ @@ -140,7 +140,7 @@ def __init__(self, options, columns): self.columns = columns self.default_namespace_prefix = options.pop( 'default_namespace_prefix', None) - self.item_root = options.pop('item_root', 'item') + self.item_root = options.pop('item_root', 'entry') def get_namespaces(self, xml): ns = dict(xml.nsmap) @@ -175,7 +175,14 @@ def execute(self, quals, columns): if (datetime.now() - date) < self.cache_duration: return values try: - xml = etree.fromstring(urlopen(self.url).read()) + ssl._create_default_https_context = ssl._create_unverified_context + req = Request( + url = self.url, + data=None, + headers={'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36'} + ) + s = urlopen(req).read() + xml = etree.fromstring(s) items = [self.make_item_from_xml(elem) for elem in xml.xpath( '//%s' % self.item_root,