Deadline: (.*?)
') +deadline = deadline_.findall(r.text)[0] +print(deadline) + +cost_ = re.compile('Eligibility
(.*?)') +cost = cost_.findall(r.text) +print(cost) + +# eligibility_ = re.compile('') +# eligibility = eligibility_.search(r.content) + +# application_ = re.compile('') +# application = application_.search(r.content) + +# benefits_ = re.compile('') +# benefits = benefits_.search(r.content) \ No newline at end of file diff --git a/requirements.txt b/requirements.txt old mode 100644 new mode 100755 diff --git a/youthop.py b/youthop.py old mode 100644 new mode 100755 index 18995ca..a05c997 --- a/youthop.py +++ b/youthop.py @@ -9,27 +9,33 @@ csv_writer = csv.writer(csv_file) csv_writer.writerow(['headline', 'summary', 'Apply link', 'Website link']) -for i in range(1, 7): - code = requests.get('https://www.youthop.com/workshops/page/' + str(i)) - soup = BeautifulSoup(code.text, 'html.parser') - for para in soup.find_all('div', class_='post-header'): - _a = para.a.get('href') - code = requests.get(_a) - soup = BeautifulSoup(code.text, 'html.parser') - - # article headline - article_headline = soup.find(id="main") - headline = article_headline.h1.text - headline = re.sub(r'[^\x00-\x7F]+', ' ', headline) - print(headline) - - # article summary paragraph +code = requests.get('https://www.youthop.com/workshops/page/1') +soup = BeautifulSoup(code.text, 'html.parser') + +# article headline +article_headline = soup.find(id="main") +headline = article_headline.h1.text +headline = re.sub(r'[^\x00-\x7F]+', ' ', headline) +print(headline) + +# article summary paragraph +try: article_para = soup.find('div', class_='article-content') # opp paragraph summary = article_para.p.text summary = re.sub(r'[^\x00-\x7F]+', ' ', summary) print(summary) - - # article apply and official link +except AttributeError as e: + print( + "OOPS paragraph") + print(str(e)) + +except UnicodeEncodeError as e: + print( + "OOPS paragraph") + print(str(e)) + +# article apply and official link +try: all_link = soup.find('div', class_='application-process') # apply now link _a_list = all_link.find_all('a') @@ -39,6 +45,16 @@ web_link = _a_list[1].get('href') print(web_link) - csv_writer.writerow([headline, summary, apply_link, web_link]) +except AttributeError as e: + print( + "OOPS link") + print(str(e)) + +except UnicodeEncodeError as e: + print( + "OOPS link") + print(str(e)) + +csv_writer.writerow([headline, summary, apply_link, web_link]) csv_file.close() diff --git a/youthopscrape.py b/youthopscrape.py old mode 100644 new mode 100755 index 5005de7..dda6b1f --- a/youthopscrape.py +++ b/youthopscrape.py @@ -9,11 +9,12 @@ csv_writer = csv.writer(csv_file) csv_writer.writerow(['Image Link', 'headline', 'summary', 'Website link', 'Deadline']) -for i in range(1, 5): +for i in range(1, 2): code = requests.get('https://www.youthop.com/exchange-programs/page/' + str(i)) soup = BeautifulSoup(code.text, 'html.parser') for para in soup.find_all('div', class_='post-header'): + _a = para.a.get('href') code = requests.get(_a) soup = BeautifulSoup(code.text, 'html.parser')