forked from bhanu-lab/PythonScripts
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathNewsPaperDownloaderV2.py
More file actions
42 lines (36 loc) · 1.13 KB
/
NewsPaperDownloaderV2.py
File metadata and controls
42 lines (36 loc) · 1.13 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
# -*- coding: utf-8 -*-
"""
Created on Thu Apr 4 19:30:46 2019
@author: Rajesh.Gundupalli
"""
import os
import re

import html5lib
import requests
from bs4 import BeautifulSoup
# Page that carries the link to the ad-free copy of The Hindu.
# Alternative source: "https://www.bitul.in/epaper/indian-express-pdf/"
URL = "https://www.bitul.in/epaper/the-hindu/"

# A link containing DIRECT_HOST is downloaded as-is; a link containing
# DRIVE_HOST is a Google Drive share link that must first be turned into a
# direct-download URL.
DIRECT_HOST = "http://www.newspapertoday.xyz"
DRIVE_HOST = "drive.google.com"

# Where the downloaded paper is written (relative to the working directory).
OUTPUT_PATH = "resources/NewsPaper/NewsPaper.pdf"

REQUEST_TIMEOUT = 30  # seconds; without it a stalled server hangs the script
CHUNK_SIZE = 1024     # bytes written per iteration while streaming

# Share links look like ".../file/d/<id>/view" or ".../open?id=<id>".
_DRIVE_ID_PATTERNS = (
    re.compile(r"/file/d/([\w-]+)"),
    re.compile(r"[?&]id=([\w-]+)"),
)


def find_paper_link(hrefs):
    """Return ``(link, is_direct)`` for the first usable href, or ``None``.

    ``hrefs`` is any iterable of href values in page order; entries that are
    not strings (e.g. ``None`` for anchors without an href) are skipped.
    ``is_direct`` is True for a DIRECT_HOST link and False for a Google
    Drive link.
    """
    for href in hrefs:
        if not isinstance(href, str):
            continue
        if DIRECT_HOST in href:
            return href, True
        if DRIVE_HOST in href:
            return href, False
    return None


def extract_drive_id(drive_link):
    """Return the file id embedded in a Google Drive share link.

    Raises:
        ValueError: if no id can be located in ``drive_link``.
    """
    # Match on the URL structure rather than fixed character offsets so that
    # ids of any length and both link layouts are handled.
    for pattern in _DRIVE_ID_PATTERNS:
        match = pattern.search(drive_link)
        if match:
            return match.group(1)
    raise ValueError(
        "Could not find a Google Drive file id in {!r}".format(drive_link)
    )


def build_download_url(link, is_direct):
    """Return the URL to fetch the PDF from.

    Direct links are returned unchanged; Drive share links are rewritten to
    the ``uc?...&export=download`` form.
    """
    if is_direct:
        return link
    return (
        "https://drive.google.com/uc?authuser=0&id="
        + extract_drive_id(link)
        + "&export=download"
    )


def download_file(file_url, destination=OUTPUT_PATH):
    """Stream ``file_url`` to ``destination``, creating its folder if needed.

    Raises:
        requests.HTTPError: if the server answers with an error status, so an
            error page is never saved in place of the PDF.
    """
    folder = os.path.dirname(destination)
    if folder:
        os.makedirs(folder, exist_ok=True)
    # The context manager releases the connection even if writing fails.
    with requests.get(file_url, stream=True, timeout=REQUEST_TIMEOUT) as response:
        response.raise_for_status()
        with open(destination, "wb") as pdf:
            for chunk in response.iter_content(chunk_size=CHUNK_SIZE):
                # Skip empty chunks.
                if chunk:
                    pdf.write(chunk)


def main():
    """Find the newspaper link on ``URL`` and download it to OUTPUT_PATH."""
    page = requests.get(URL, timeout=REQUEST_TIMEOUT)
    page.raise_for_status()
    soup = BeautifulSoup(page.content, 'html5lib')

    # Check the hyperlinks in page order until a known download link appears.
    found = find_paper_link(anchor.get('href') for anchor in soup.find_all('a'))
    if found is None:
        # Stop here instead of building a URL from an unrelated link.
        raise SystemExit("No newspaper download link found on " + URL)

    link, is_direct = found
    print(link)
    download_file(build_download_url(link, is_direct))


if __name__ == "__main__":
    main()