# Check if Ollama is installed
ollama --version
# Expected output: ollama version X.X.X
# Start Ollama (if not running)
ollama serve
# In another terminal, verify connection
curl http://localhost:11434/api/tags
# Expected output: JSON with your installed models

ollama list
# You should see these models:
# qwen3:4b  (2.5 GB) ✅
# llama3.1  (4.9 GB) ✅
# mistral   (4.4 GB) ✅

# Quick test with qwen3:4b (fast)
ollama run qwen3:4b "Summarize this in one sentence: The quick brown fox jumped over the lazy dog."
# Or test llama3.1
ollama run llama3.1 "Summarize this in one sentence: The quick brown fox jumped over the lazy dog."
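The ollama run commands above go through the CLI; the extractor itself will talk to Ollama over its HTTP API. To confirm that path works as well, you can hit the /api/generate endpoint directly (a quick sketch; the model and prompt simply mirror the test above):

# Non-streaming request to the Ollama HTTP API
curl http://localhost:11434/api/generate -d '{
  "model": "qwen3:4b",
  "prompt": "Summarize this in one sentence: The quick brown fox jumped over the lazy dog.",
  "stream": false
}'
# The reply is a single JSON object; the summary text is in its "response" field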
# Create main project directory
mkdir article-extractor
cd article-extractor
# Create subdirectories
mkdir -p src/{services,utils,config,cli}
mkdir -p tests/{unit,integration}
mkdir -p config
mkdir -p output/{articles,images,cache}
mkdir -p docs
mkdir -p logs
# Initialize git
git init

# Create package.json
npm init -y
# Or with more options:
npm init
# Project name: article-extractor
# Version: 0.1.0
# Description: Extract and summarize paywalled articles
# Entry point: dist/index.js
# Test command: jest
# License: MIT

# Browser automation
npm install puppeteer
# HTTP client (for Ollama)
npm install axios
# CLI framework
npm install commander chalk ora
# Markdown processing
npm install turndown gray-matter
# Image processing
npm install sharp
# Utilities
npm install dotenv zod lodash
# Development dependencies
npm install -D typescript @types/node ts-node tsconfig-paths
npm install -D eslint prettier
npm install -D jest @types/jest ts-jest
npm install -D @typescript-eslint/eslint-plugin @typescript-eslint/parser
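Nothing above wires ESLint up yet. One possible .eslintrc.json for the TypeScript plugin, assuming the classic config format rather than ESLint's newer flat config (a sketch only; adjust rules to taste):

{
  "parser": "@typescript-eslint/parser",
  "plugins": ["@typescript-eslint"],
  "extends": ["eslint:recommended", "plugin:@typescript-eslint/recommended"],
  "env": { "node": true, "es2020": true },
  "parserOptions": { "ecmaVersion": 2020, "sourceType": "module" }
}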
# Generate tsconfig.json
npx tsc --init
# Then edit tsconfig.json with:
{
  "compilerOptions": {
    "target": "ES2020",
    "module": "ESNext",
    "lib": ["ES2020"],
    "outDir": "./dist",
    "rootDir": "./src",
    "strict": true,
    "esModuleInterop": true,
    "skipLibCheck": true,
    "forceConsistentCasingInFileNames": true,
    "resolveJsonModule": true,
    "declaration": true,
    "declarationMap": true
  },
  "include": ["src/**/*"],
  "exclude": ["node_modules", "tests", "dist"]
}

config/default.json:
{
  "browser": {
    "timeout": 30000,
    "headless": true,
    "antiDetection": true,
    "retries": 3
  },
  "llm": {
    "type": "ollama",
    "baseUrl": "http://localhost:11434",
    "primaryModel": "llama3.1",
    "fallbackModels": ["mistral", "qwen3:4b"],
    "summaryLength": "medium",
    "temperature": 0.5,
    "timeout": 120000,
    "streaming": true,
    "caching": true,
    "cacheDir": "./cache/summaries"
  },
  "images": {
    "maxWidth": 1200,
    "quality": 80,
    "maxConcurrent": 5,
    "timeout": 10000
  },
  "output": {
    "baseDir": "./output/articles",
    "structure": "date/publication",
    "naming": "slug",
    "createSubfolders": true,
    "deduplication": "url-hash"
  },
  "logging": {
    "level": "info",
    "format": "text",
    "file": "./logs/app.log"
  }
}

.env.example:
# Ollama Configuration
OLLAMA_BASE_URL=http://localhost:11434
OLLAMA_PRIMARY_MODEL=llama3.1
OLLAMA_TIMEOUT=120000
# Browser Configuration
BROWSER_HEADLESS=true
BROWSER_TIMEOUT=30000
# Output Configuration
OUTPUT_DIR=./output/articles
CACHE_DIR=./cache
# Logging
LOG_LEVEL=info
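The guide installs dotenv and zod but does not show how config/default.json and the .env values come together. One possible shape for src/config/index.ts, matching the project tree later in this document (a sketch only; the schema is trimmed to a few fields and the merge logic is an assumption, not the project's final design):

// src/config/index.ts -- sketch: load config/default.json, overlay env vars, validate with zod
import { readFileSync } from 'fs';
import * as dotenv from 'dotenv';
import { z } from 'zod';

dotenv.config();

// Only a subset of the config is validated here; extend the schema as services are added
const ConfigSchema = z.object({
  llm: z.object({
    baseUrl: z.string().url(),
    primaryModel: z.string(),
    timeout: z.number().int().positive(),
  }),
  browser: z.object({
    headless: z.boolean(),
    timeout: z.number().int().positive(),
  }),
});

const defaults = JSON.parse(readFileSync('config/default.json', 'utf-8'));

// Environment variables win over file defaults
const merged = {
  llm: {
    ...defaults.llm,
    baseUrl: process.env.OLLAMA_BASE_URL ?? defaults.llm.baseUrl,
    primaryModel: process.env.OLLAMA_PRIMARY_MODEL ?? defaults.llm.primaryModel,
    timeout: Number(process.env.OLLAMA_TIMEOUT ?? defaults.llm.timeout),
  },
  browser: {
    ...defaults.browser,
    headless: (process.env.BROWSER_HEADLESS ?? String(defaults.browser.headless)) === 'true',
    timeout: Number(process.env.BROWSER_TIMEOUT ?? defaults.browser.timeout),
  },
};

export const config = ConfigSchema.parse(merged); // throws with a readable error if anything is malformed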
.gitignore:

# Dependencies
node_modules/
package-lock.json
yarn.lock
# Build
dist/
*.tsbuildinfo
# Environment
.env
.env.local
# Output
output/
cache/
logs/
# IDE
.vscode/
.idea/
*.swp
*.swo
# OS
.DS_Store
Thumbs.db
# Cookies (security)
cookies.json
cookies.*.json
Add these scripts to package.json:

{
  "scripts": {
    "build": "tsc",
    "dev": "ts-node src/index.ts",
    "start": "node dist/index.js",
    "test": "jest",
    "test:watch": "jest --watch",
    "lint": "eslint src --ext .ts",
    "format": "prettier --write src/**/*.ts",
    "check-ollama": "ts-node src/utils/check-ollama.ts",
    "extract": "ts-node src/cli/index.ts extract",
    "batch": "ts-node src/cli/index.ts batch"
  },
  "keywords": ["article", "extractor", "summarizer", "paywall", "ollama"],
  "author": "Your Name",
  "license": "MIT"
}
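The project tree at the end of this guide also lists a jest.config.js that the steps above never create. With ts-jest installed, a minimal version could look like this (a sketch; adjust paths to where your tests actually live):

// jest.config.js -- minimal ts-jest setup, assuming tests live under tests/
module.exports = {
  preset: 'ts-jest',
  testEnvironment: 'node',
  roots: ['<rootDir>/tests'],
};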
src/utils/check-ollama.ts:

import axios from 'axios';

async function checkOllamaHealth() {
  try {
    const baseUrl = process.env.OLLAMA_BASE_URL || 'http://localhost:11434';
    // /api/tags lists the models installed locally
    const response = await axios.get(`${baseUrl}/api/tags`, { timeout: 5000 });
    const models = response.data.models || [];

    console.log('✅ Ollama is running');
    console.log(`\nInstalled models (${models.length}):`);
    models.forEach((model: any) => {
      const name = model.name;
      const size = (model.size / 1024 / 1024 / 1024).toFixed(2);
      console.log(`  - ${name} (${size} GB)`);
    });

    // Check for recommended models
    const modelNames = models.map((m: any) => m.name);
    const recommended = ['llama3.1', 'mistral', 'qwen3:4b'];
    const missing = recommended.filter(m => !modelNames.some((name: string) => name.includes(m)));

    if (missing.length > 0) {
      console.log(`\n⚠️ Missing recommended models: ${missing.join(', ')}`);
      console.log('\nTo install, run:');
      missing.forEach(m => console.log(`  ollama pull ${m}`));
    }
  } catch (error) {
    console.error('❌ Ollama is not running or not accessible');
    console.error('   Please start Ollama with: ollama serve');
    process.exit(1);
  }
}

checkOllamaHealth();

Run the health check:

npm run check-ollama

Setup checklist:

- Ollama installed and running
- Your 3 models confirmed installed (qwen3:4b, llama3.1, mistral)
- Node.js 18+ installed
- Project directory created
- package.json initialized
- Dependencies installed
- TypeScript configured
- Configuration files created
- .gitignore created
- Ollama health check passes
- Scripts added to package.json
Once setup is complete:
# Verify everything works
npm run check-ollama
# Build the project
npm run build
# Run in development
npm run dev
# Extract a single article
npm run extract -- https://example.com/article --cookies cookies.json
# Help menu
npm run extract -- --help

Hardware requirements:

- CPU: Multi-core processor (4+ cores recommended)
- RAM: 8GB minimum, 16GB recommended
- GPU: Optional but recommended (NVIDIA or AMD for Ollama)
- Storage:
  - Ollama models: ~12-15 GB
  - Output cache: ~1-2 GB per 100 articles
- Network: Internet access for fetching articles and the initial model downloads; summarization runs locally
# Check available memory
free -h # Linux
vm_stat # macOS
wmic OS get TotalVisibleMemorySize # Windows
# Check disk space
df -h # Linux/macOS
dir C:\ # Windows
# Verify GPU (optional)
nvidia-smi # NVIDIA GPUs
rocm-smi    # AMD GPUs

Troubleshooting:

❌ "ollama: command not found"
- Install Ollama from https://ollama.ai
- Restart your terminal
- Check your PATH: echo $PATH

❌ "Cannot connect to http://localhost:11434"
- Start Ollama: ollama serve
- Check that it is listening: netstat -an | grep 11434
- You may need to allow firewall access

❌ A model is missing
- List installed models: ollama list
- Download the missing model: ollama pull llama3.1
- Wait for the download to complete

❌ Summaries are slow or run out of memory
- Use a smaller model: qwen3:4b instead of llama3.1
- Close other applications
- Check GPU VRAM: nvidia-smi

❌ npm install fails
- Clear the npm cache: npm cache clean --force
- Delete node_modules: rm -rf node_modules
- Reinstall: npm install
- Check your Node version: node --version (need 18+)
Next steps:

- Complete this checklist
- Follow TASKS.md starting with Task 1.1
- Implement Phase 1 (Foundation & Infrastructure)
- Test the OllamaClient implementation (see the sketch after this list)
- Then proceed to Phase 2 (Content Extraction)
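Since OllamaClient is the next service to implement, here is a rough sketch of its core call, assuming the standard Ollama /api/generate endpoint and the environment variables from .env.example (the class shape and method name are placeholders, not the project's final design):

// src/services/OllamaClient.ts -- sketch of a non-streaming generate call
import axios from 'axios';

export class OllamaClient {
  constructor(
    private baseUrl = process.env.OLLAMA_BASE_URL ?? 'http://localhost:11434',
    private model = process.env.OLLAMA_PRIMARY_MODEL ?? 'llama3.1',
    private timeout = Number(process.env.OLLAMA_TIMEOUT ?? 120000),
  ) {}

  // Send a prompt to Ollama and return the full completion text
  async generate(prompt: string): Promise<string> {
    const response = await axios.post(
      `${this.baseUrl}/api/generate`,
      { model: this.model, prompt, stream: false, options: { temperature: 0.5 } },
      { timeout: this.timeout },
    );
    // With stream: false, Ollama returns a single JSON object whose "response" field holds the text
    return response.data.response;
  }
}

Falling back to the models in fallbackModels and streaming support belong to the actual Phase 1 task; this only shows the request shape. The full project layout for reference: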
article-extractor/
├── src/
│   ├── services/
│   │   ├── CookieManager.ts
│   │   ├── BrowserEngine.ts
│   │   ├── OllamaClient.ts   (NEXT TO IMPLEMENT)
│   │   └── ... (other services)
│   ├── cli/
│   │   └── index.ts
│   ├── utils/
│   │   └── check-ollama.ts
│   ├── config/
│   │   └── index.ts
│   └── index.ts
├── tests/
│   ├── unit/
│   ├── integration/
│   └── fixtures/
├── config/
│   ├── default.json
│   ├── development.json
│   └── .env.example
├── output/
│   ├── articles/
│   ├── images/
│   └── cache/
├── docs/
├── logs/
├── .gitignore
├── tsconfig.json
├── jest.config.js
└── package.json
✅ All checks pass:

✅ Ollama running at http://localhost:11434
✅ 3+ models installed
✅ Node.js 18+ available
✅ npm dependencies installed
✅ TypeScript configured
✅ Configuration files created
✅ Scripts working

You're now ready to start implementing!