How to Run a Python Web Scraper 24/7 on a VPS with Scrapy
Running a web scraper on your laptop means it stops when the lid closes. Running it on a VPS means it collects data continuously — whether you’re sleeping, traveling, or working on something else. A VPS gives your Scrapy spiders a permanent, always-on home with the CPU, bandwidth, and storage to handle industrial-scale data collection.
This guide covers deploying Scrapy on an Ubuntu VPS, scheduling automated crawls, handling rate limiting with proxy rotation, storing data in a database, and monitoring spider health.
Step 1: Set Up the VPS Environment
sudo apt update && sudo apt upgrade -y
sudo apt install python3 python3-pip python3-venv git -y
mkdir -p /var/scrapers/myproject
cd /var/scrapers/myproject
python3 -m venv venv
source venv/bin/activate
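Running everything as root works, but a dedicated system user keeps permissions contained if a spider misbehaves. A minimal sketch; the scraper username is an assumption, not something the rest of this guide requires:

sudo adduser --system --group scraper
sudo chown -R scraper:scraper /var/scrapers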
Step 2: Install Scrapy and Dependencies
pip install scrapy \
    scrapy-rotating-proxies \
    scrapy-user-agents \
    sqlalchemy \
    psycopg2-binary \
    pymongo \
    itemloaders \
    python-dotenv
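Freezing the installed versions makes the environment reproducible if you ever rebuild the VPS:

pip freeze > requirements.txt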
Step 3: Create a Scrapy Project
scrapy startproject myproject .
# Creates: myproject/, scrapy.cfg
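For reference, the generated layout:

# scrapy.cfg            (project config entry point)
# myproject/
#     items.py
#     middlewares.py
#     pipelines.py
#     settings.py
#     spiders/          (your spiders live here)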
Create a spider
nano myproject/spiders/product_spider.py
import scrapy
from itemloaders import ItemLoader
from itemloaders.processors import TakeFirst, MapCompose
from datetime import datetime, timezone


class ProductItem(scrapy.Item):
    url = scrapy.Field()
    title = scrapy.Field()
    price = scrapy.Field()
    description = scrapy.Field()
    scraped_at = scrapy.Field()


class ProductSpider(scrapy.Spider):
    name = 'products'

    # Respect robots.txt and be polite
    custom_settings = {
        'DOWNLOAD_DELAY': 2,  # 2 seconds between requests
        'RANDOMIZE_DOWNLOAD_DELAY': True,
        'CONCURRENT_REQUESTS': 4,
        'ROBOTSTXT_OBEY': True,
    }

    def start_requests(self):
        urls = [
            'https://example.com/products/page/1',
            'https://example.com/products/page/2',
        ]
        for url in urls:
            yield scrapy.Request(url, callback=self.parse)

    def parse(self, response):
        # Extract product links from the listing page
        for product_url in response.css('a.product-link::attr(href)').getall():
            yield response.follow(product_url, self.parse_product)

        # Follow pagination
        next_page = response.css('a.next-page::attr(href)').get()
        if next_page:
            yield response.follow(next_page, self.parse)

    def parse_product(self, response):
        loader = ItemLoader(item=ProductItem(), response=response)
        loader.default_output_processor = TakeFirst()
        loader.add_value('url', response.url)
        loader.add_css('title', 'h1.product-title::text', MapCompose(str.strip))
        loader.add_css('price', 'span.price::text', MapCompose(str.strip))
        loader.add_css('description', 'div.product-desc::text')
        # datetime.utcnow() is deprecated; use a timezone-aware timestamp
        loader.add_value('scraped_at', datetime.now(timezone.utc).isoformat())
        yield loader.load_item()
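Before scheduling anything, a capped test run confirms the selectors work. CLOSESPIDER_ITEMCOUNT stops the crawl after ten items, and -O writes them to a throwaway file:

scrapy crawl products -O /tmp/test-products.json -s CLOSESPIDER_ITEMCOUNT=10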
Step 4: Configure Scrapy Settings for Production
nano myproject/settings.py
BOT_NAME = 'myproject'
SPIDER_MODULES = ['myproject.spiders']
# Be a good citizen
ROBOTSTXT_OBEY = True
DOWNLOAD_DELAY = 2
RANDOMIZE_DOWNLOAD_DELAY = True
CONCURRENT_REQUESTS = 8
CONCURRENT_REQUESTS_PER_DOMAIN = 4
# Auto-throttle (automatically adjusts speed based on server response)
AUTOTHROTTLE_ENABLED = True
AUTOTHROTTLE_START_DELAY = 1
AUTOTHROTTLE_MAX_DELAY = 10
AUTOTHROTTLE_TARGET_CONCURRENCY = 2.0
# Rotate User Agents to appear as different browsers
DOWNLOADER_MIDDLEWARES = {
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
    'scrapy_user_agents.middlewares.RandomUserAgentMiddleware': 400,
}
# Retry failed requests
RETRY_ENABLED = True
RETRY_TIMES = 3
RETRY_HTTP_CODES = [500, 502, 503, 504, 408, 429]
# Cache responses during development (disable in production)
# HTTPCACHE_ENABLED = True
# Output items to JSON Lines file
FEEDS = {
    '/var/scrapers/data/products_%(time)s.jsonl': {
        'format': 'jsonlines',
        'encoding': 'utf8',
        'overwrite': False,
    }
}
# Log level
LOG_LEVEL = 'INFO'
LOG_FILE = '/var/log/scrapy.log'
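The feed directory and log file referenced above must exist and be writable by whichever user runs the spider (adjust the owner if you skipped the dedicated scraper user from Step 1):

sudo mkdir -p /var/scrapers/data
sudo touch /var/log/scrapy.log
sudo chown -R scraper:scraper /var/scrapers/data /var/log/scrapy.log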
Step 5: Store Data to PostgreSQL
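If PostgreSQL is not yet on the VPS, a minimal local setup that matches the pipeline's default credentials below (swap in a real password):

sudo apt install postgresql -y
sudo -u postgres psql -c "CREATE USER scraper WITH PASSWORD 'password';"
sudo -u postgres psql -c "CREATE DATABASE scraperdb OWNER scraper;"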
nano myproject/pipelines.py
import os

import psycopg2
from dotenv import load_dotenv

# Pull DB credentials from a .env file in the project root (python-dotenv
# was installed in Step 2); the defaults below apply if no .env is present
load_dotenv()


class PostgreSQLPipeline:
    def __init__(self):
        self.conn = psycopg2.connect(
            host=os.getenv('DB_HOST', 'localhost'),
            database=os.getenv('DB_NAME', 'scraperdb'),
            user=os.getenv('DB_USER', 'scraper'),
            password=os.getenv('DB_PASS', 'password')
        )
        self.cursor = self.conn.cursor()
        self.create_table()

    def create_table(self):
        self.cursor.execute("""
            CREATE TABLE IF NOT EXISTS products (
                id SERIAL PRIMARY KEY,
                url TEXT UNIQUE,
                title TEXT,
                price TEXT,
                description TEXT,
                scraped_at TIMESTAMP DEFAULT NOW()
            )
        """)
        self.conn.commit()

    def process_item(self, item, spider):
        self.cursor.execute("""
            INSERT INTO products (url, title, price, description, scraped_at)
            VALUES (%s, %s, %s, %s, %s)
            ON CONFLICT (url) DO UPDATE SET
                price = EXCLUDED.price,
                scraped_at = EXCLUDED.scraped_at
        """, (
            item.get('url'),
            item.get('title'),
            item.get('price'),
            item.get('description'),
            item.get('scraped_at')
        ))
        self.conn.commit()
        return item

    def close_spider(self, spider):
        self.cursor.close()
        self.conn.close()
Enable in settings.py:
ITEM_PIPELINES = {
    'myproject.pipelines.PostgreSQLPipeline': 300,
}
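The credentials themselves can live in a .env file next to scrapy.cfg, where the load_dotenv() call in the pipeline picks them up (the values below are placeholders):

# /var/scrapers/myproject/.env
DB_HOST=localhost
DB_NAME=scraperdb
DB_USER=scraper
DB_PASS=your-secure-password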
Step 6: Schedule Automated Crawls with Cron
nano /var/scrapers/run-spider.sh
#!/bin/bash
PROJECT_DIR="/var/scrapers/myproject"
LOG_DIR="/var/log/scrapers"
DATE=$(date +%Y-%m-%d_%H-%M)

mkdir -p "$LOG_DIR"
cd "$PROJECT_DIR" || exit 1
source venv/bin/activate

echo "Starting spider at $(date)" >> "$LOG_DIR/cron.log"
scrapy crawl products \
    -s LOG_FILE="$LOG_DIR/products-$DATE.log" \
    >> "$LOG_DIR/cron.log" 2>&1
EXIT_CODE=$?  # capture immediately, before another command overwrites $?
echo "Spider finished at $(date) with exit code $EXIT_CODE" >> "$LOG_DIR/cron.log"
chmod +x /var/scrapers/run-spider.sh
crontab -e
# Pick one schedule. Run every 6 hours:
0 */6 * * * /bin/bash /var/scrapers/run-spider.sh
# Or run every day at 3 AM:
0 3 * * * /bin/bash /var/scrapers/run-spider.sh
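If a crawl can outlast the interval between runs, flock (part of util-linux, preinstalled on Ubuntu) prevents overlapping instances:

0 */6 * * * /usr/bin/flock -n /tmp/product-scraper.lock /bin/bash /var/scrapers/run-spider.sh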
Step 7: IP Rotation with Rotating Proxies
If a site rate-limits or blocks your VPS's IP, rotating requests across a pool of proxies spreads the traffic and keeps the crawl running:
pip install scrapy-rotating-proxies
# In settings.py
ROTATING_PROXY_LIST = [
    'proxy1.example.com:8080',
    'proxy2.example.com:8080',
    'user:pass@proxy3.example.com:8080',
]

DOWNLOADER_MIDDLEWARES = {
    'rotating_proxies.middlewares.RotatingProxyMiddleware': 610,
    'rotating_proxies.middlewares.BanDetectionMiddleware': 620,
}
VPS.DO's 30IPs plan gives you 30 US IPv4 addresses that can serve as your own rotation pool, with no third-party proxy fees:
# Configure multiple outbound IPs on your VPS
# Then use each IP as a proxy endpoint in ROTATING_PROXY_LIST
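One way to turn those addresses into a pool, sketched under the assumption that the extra IPs are already bound to the server's network interface: run a lightweight forward proxy such as tinyproxy, one instance per address, using its Listen directive for the local port and Bind for the outbound IP. File names and addresses below are illustrative:

# /etc/tinyproxy/ip1.conf  (one config per outbound IP)
Port 8881
Listen 127.0.0.1        # accept connections only from this VPS
Bind 203.0.113.10       # send outgoing traffic via this IPv4 address

# Then point Scrapy at the local endpoints in settings.py:
# ROTATING_PROXY_LIST = ['127.0.0.1:8881', '127.0.0.1:8882', ...]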
Step 8: Monitor Spider Health with PM2
# PM2 is a Node.js tool, so install Node first
sudo apt install nodejs npm -y
sudo npm install -g pm2
# For long-running spiders, wrap in a loop script
nano /var/scrapers/continuous-spider.py
import subprocess
import time
import logging

logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s %(levelname)s %(message)s')

# Use the virtualenv's interpreter so Scrapy and its dependencies resolve,
# even though PM2 launches this wrapper with the system python3
VENV_PYTHON = '/var/scrapers/myproject/venv/bin/python'


def run_spider():
    logging.info("Starting spider run...")
    result = subprocess.run(
        [VENV_PYTHON, '-m', 'scrapy', 'crawl', 'products'],
        capture_output=True, text=True,
        cwd='/var/scrapers/myproject'
    )
    if result.returncode != 0:
        logging.error(f"Spider failed: {result.stderr}")
    else:
        logging.info("Spider run complete")


while True:
    run_spider()
    logging.info("Waiting 4 hours before next run...")
    time.sleep(4 * 3600)
cd /var/scrapers/myproject
source venv/bin/activate
pm2 start /var/scrapers/continuous-spider.py \
--name "product-scraper" \
--interpreter python3
pm2 startup && pm2 save
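Day-to-day monitoring then comes down to a few commands:

pm2 status                    # is the scraper process up?
pm2 logs product-scraper      # tail the wrapper's stdout/stderr
pm2 restart product-scraper   # manual restart after code changes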
Step 9: Handle Anti-Scraping Measures
Respect rate limits
Always keep AUTOTHROTTLE_ENABLED = True and a nonzero DOWNLOAD_DELAY. Aggressive scraping risks IP bans and puts unnecessary load on the target servers.
Handle JavaScript-rendered content
pip install scrapy-playwright
playwright install --with-deps chromium
# In settings.py — scrapy-playwright also requires the asyncio reactor
DOWNLOAD_HANDLERS = {
    "http": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
    "https": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
}
TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
# In spider request:
yield scrapy.Request(url, meta={"playwright": True})
Identify yourself honestly
# In settings.py — be transparent about who you are
USER_AGENT = 'MyDataCollector/1.0 (+https://yourdomain.com/bot-info)'
Final Thoughts
A VPS transforms Scrapy from a development tool into a production data pipeline. Scheduled crawls, persistent databases, proxy rotation, and PM2 process management give you a robust, 24/7 data collection system that runs without intervention. VPS.DO’s USA VPS plans offer the bandwidth, SSD storage, and multiple IPv4 addresses that make large-scale, responsible scraping practical.