34 changes: 17 additions & 17 deletions Dockerfile
@@ -1,17 +1,17 @@
-FROM python:3.10.2-alpine3.15
-COPY . .
-# Install Postgres
-RUN apk update
-RUN apk add postgresql
-RUN chown postgres:postgres /run/postgresql/
-# Install requirements
-COPY ./requirements.txt /tmp
-RUN pip install -r /tmp/requirements.txt
-# For psycopg2
-RUN apk add --virtual postgresql-deps libpq-dev
-# Create directories
-RUN mkdir -p /root/workspace/src
-# Mount your local file
-COPY ./web_scraping_sample.py /root/workspace/src
-# Switch to project directory
-WORKDIR /root/workspace/src
+# Use the official Python image
+FROM python:3.9-slim
+
+# Set the working directory
+WORKDIR /app
+
+# Copy the requirements file
+COPY requirements.txt .
+
+# Install the required Python packages
+RUN pip install --no-cache-dir -r requirements.txt
+
+# Copy the Python script
+COPY scraper.py .
+
+# Run the Python script
+CMD ["python", "scraper.py"]
26 changes: 0 additions & 26 deletions docker-compose.yaml

This file was deleted.

35 changes: 35 additions & 0 deletions docker-compose.yml
@@ -0,0 +1,35 @@
+version: '3.8'
+
+services:
+  db:
+    image: postgres:13
+    container_name: postgres_container
+    environment:
+      POSTGRES_DB: blogdata
+      POSTGRES_USER: postgres
+      POSTGRES_PASSWORD: password
+    networks:
+      - blog_net
+    volumes:
+      - pgdata:/var/lib/postgresql/data
+
+  scraper:
+    build: .
+    container_name: scraper_container
+    environment:
+      DB_NAME: blogdata
+      DB_USER: postgres
+      DB_PASSWORD: password
+      DB_HOST: db
+      DB_PORT: 5432
+    depends_on:
+      - db
+    networks:
+      - blog_net
+
+networks:
+  blog_net:
+    external: true
+
+volumes:
+  pgdata:
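
Note: because blog_net is declared external, Compose will not create it, and startup fails if the network does not already exist; create it once beforehand with: docker network create blog_net. The named pgdata volume keeps the Postgres data directory across container restarts.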
6 changes: 2 additions & 4 deletions requirements.txt
@@ -1,5 +1,3 @@
-psycopg2==2.9.3
-bs4
-urllib2
 requests
-html5lib==1.1
+beautifulsoup4
+psycopg2-binary
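
Note: psycopg2-binary ships precompiled wheels, so the libpq and build dependencies the old Dockerfile installed for psycopg2 are no longer needed; requests and beautifulsoup4 cover the HTTP and HTML-parsing imports in scraper.py. urllib2 was a Python 2 standard-library module, not an installable package, so dropping it also fixes the pip install.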
97 changes: 97 additions & 0 deletions scraper.py
@@ -0,0 +1,97 @@
+import requests
+from bs4 import BeautifulSoup
+import psycopg2
+import os
+import time
+
+def scrape_blog():
+    base_url = "https://blog.python.org/"
+    current_url = base_url
+    all_posts = []
+
+    while True:
+        response = requests.get(current_url)
+        if response.status_code != 200:
+            print(f"Failed to retrieve the page. Status code: {response.status_code}")
+            break
+
+        soup = BeautifulSoup(response.content, 'html.parser')
+        posts = soup.find_all('div', class_='date-outer')
+
+        for post in posts:
+            title_tag = post.find('h3', class_='post-title')
+            date_tag = post.find('h2', class_='date-header')
+            content_tag = post.find('div', class_='post-body')
+            author_tag = post.find('span', class_='fn')
+            if title_tag and date_tag and content_tag:
+                title = title_tag.get_text(strip=True)
+                date = date_tag.get_text(strip=True)
+                content = content_tag.get_text(strip=True)
+            else:
+                print("Skipping incomplete post")
+                continue
+            author = author_tag.get_text(strip=True) if author_tag else 'Unknown'
+            all_posts.append({
+                'title': title,
+                'date': date,
+                'author': author,
+                'content': content
+            })
+        older_posts_link = soup.find('a', {'class': 'blog-pager-older-link'})
+        if older_posts_link:
+            current_url = older_posts_link['href']
+        else:
+            break
+    return all_posts
+
+def save_to_postgres(blog_posts):
+    # PostgreSQL connection details
+    DB_NAME = os.getenv("DB_NAME", "blogdata")
+    DB_USER = os.getenv("DB_USER", "postgres")
+    DB_PASSWORD = os.getenv("DB_PASSWORD", "password")
+    DB_HOST = os.getenv("DB_HOST", "db")
+    DB_PORT = os.getenv("DB_PORT", "5432")
+
+    time.sleep(10)
+
+    conn = psycopg2.connect(
+        dbname=DB_NAME,
+        user=DB_USER,
+        password=DB_PASSWORD,
+        host=DB_HOST,
+        port=DB_PORT
+    )
+
+
+    cur = conn.cursor()
+
+
+    cur.execute("""
+        CREATE TABLE IF NOT EXISTS blog_posts (
+            id SERIAL PRIMARY KEY,
+            date TEXT,
+            title TEXT,
+            author TEXT,
+            content TEXT
+        );
+    """)
+
+
+    for post in blog_posts:
+        cur.execute(
+            "INSERT INTO blog_posts (date, title, author, content) VALUES (%s, %s, %s, %s)",
+            (post['date'], post['title'], post['author'], post['content'])
+        )
+
+
+    conn.commit()
+    cur.close()
+    conn.close()
+    print("Data has been successfully written to the PostgreSQL database")
+
+def main():
+    blog_posts = scrape_blog()
+    save_to_postgres(blog_posts)
+
+if __name__ == "__main__":
+    main()
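
Note: the fixed time.sleep(10) in save_to_postgres only works when Postgres happens to become ready within ten seconds. A more robust pattern, sketched below and not part of this PR (the function name and the attempts/delay defaults are illustrative), is to retry the connection until the server accepts it:

import time
import psycopg2

def connect_with_retry(dbname, user, password, host, port, attempts=10, delay=2):
    # Retry until Postgres accepts connections instead of sleeping a fixed time.
    # Parameters mirror the environment variables read in save_to_postgres;
    # attempts and delay are illustrative defaults, not tuned values.
    for attempt in range(1, attempts + 1):
        try:
            return psycopg2.connect(dbname=dbname, user=user, password=password,
                                    host=host, port=port)
        except psycopg2.OperationalError:
            if attempt == attempts:
                raise
            time.sleep(delay)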
31 changes: 0 additions & 31 deletions web_scraping_sample.py

This file was deleted.