How to Run a Python Web Scraper 24/7 on a VPS with Scrapy

Running a web scraper on your laptop means it stops when the lid closes. Running it on a VPS means it collects data continuously — whether you’re sleeping, traveling, or working on something else. A VPS gives your Scrapy spiders a permanent, always-on home with the CPU, bandwidth, and storage to handle industrial-scale data collection.

This guide covers deploying Scrapy on Ubuntu VPS, scheduling automated crawls, handling rate limiting with proxy rotation, storing data to a database, and monitoring spider health.

Step 1: Set Up the VPS Environment

sudo apt update && sudo apt upgrade -y
sudo apt install python3 python3-pip python3-venv git -y

mkdir -p /var/scrapers/myproject
cd /var/scrapers/myproject
python3 -m venv venv
source venv/bin/activate

Step 2: Install Scrapy and Dependencies

pip install scrapy \
            scrapy-rotating-proxies \
            scrapy-user-agents \
            sqlalchemy \
            psycopg2-binary \
            pymongo \
            itemloaders \
            python-dotenv

Step 3: Create a Scrapy Project

scrapy startproject myproject .
# Creates: myproject/, scrapy.cfg

Create a spider

nano myproject/spiders/product_spider.py
import scrapy
from itemloaders import ItemLoader
from itemloaders.processors import TakeFirst, MapCompose
from datetime import datetime, timezone

class ProductItem(scrapy.Item):
    url         = scrapy.Field()
    title       = scrapy.Field()
    price       = scrapy.Field()
    description = scrapy.Field()
    scraped_at  = scrapy.Field()

class ProductSpider(scrapy.Spider):
    name = 'products'

    # Respect robots.txt and be polite
    custom_settings = {
        'DOWNLOAD_DELAY': 2,          # 2 seconds between requests
        'RANDOMIZE_DOWNLOAD_DELAY': True,
        'CONCURRENT_REQUESTS': 4,
        'ROBOTSTXT_OBEY': True,
    }

    def start_requests(self):
        urls = [
            'https://example.com/products/page/1',
            'https://example.com/products/page/2',
        ]
        for url in urls:
            yield scrapy.Request(url, callback=self.parse)

    def parse(self, response):
        # Extract product links from listing page
        for product_url in response.css('a.product-link::attr(href)').getall():
            yield response.follow(product_url, self.parse_product)

        # Follow pagination
        next_page = response.css('a.next-page::attr(href)').get()
        if next_page:
            yield response.follow(next_page, self.parse)

    def parse_product(self, response):
        loader = ItemLoader(item=ProductItem(), response=response)
        loader.default_output_processor = TakeFirst()

        loader.add_value('url',        response.url)
        loader.add_css('title',        'h1.product-title::text')
        loader.add_css('price',        'span.price::text')
        loader.add_css('description',  'div.product-desc::text')
        loader.add_value('scraped_at', datetime.now(timezone.utc).isoformat())

        yield loader.load_item()
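
The imports above include MapCompose, which the spider does not use yet. Input processors like this are handy for cleaning fields as they are loaded; for example, a small helper could turn "$1,299.00" into a float before the item ever reaches a pipeline. A minimal sketch (clean_price is a hypothetical helper, not part of Scrapy):

def clean_price(value):
    # "$1,299.00 " -> 1299.0; returns None if the text is not a number
    cleaned = value.replace('$', '').replace(',', '').strip()
    try:
        return float(cleaned)
    except ValueError:
        return None

# Then, in parse_product(), pass it as an input processor for the price field:
#     loader.add_css('price', 'span.price::text', MapCompose(clean_price))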

Step 4: Configure Scrapy Settings for Production

nano myproject/settings.py
BOT_NAME = 'myproject'
SPIDER_MODULES = ['myproject.spiders']

# Be a good citizen
ROBOTSTXT_OBEY = True
DOWNLOAD_DELAY = 2
RANDOMIZE_DOWNLOAD_DELAY = True
CONCURRENT_REQUESTS = 8
CONCURRENT_REQUESTS_PER_DOMAIN = 4

# Auto-throttle (automatically adjusts speed based on server response)
AUTOTHROTTLE_ENABLED = True
AUTOTHROTTLE_START_DELAY = 1
AUTOTHROTTLE_MAX_DELAY = 10
AUTOTHROTTLE_TARGET_CONCURRENCY = 2.0

# Rotate User Agents to appear as different browsers
DOWNLOADER_MIDDLEWARES = {
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
    'scrapy_user_agents.middlewares.RandomUserAgentMiddleware': 400,
}

# Retry failed requests
RETRY_ENABLED = True
RETRY_TIMES = 3
RETRY_HTTP_CODES = [500, 502, 503, 504, 408, 429]

# Cache responses during development (disable in production)
# HTTPCACHE_ENABLED = True

# Output items to JSON Lines file
FEEDS = {
    '/var/scrapers/data/products_%(time)s.jsonl': {
        'format': 'jsonlines',
        'encoding': 'utf8',
        'overwrite': False,
    }
}

# Log level
LOG_LEVEL = 'INFO'
LOG_FILE = '/var/log/scrapers/scrapy.log'   # make sure this directory exists and is writable
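
For unattended crawls it also helps to put an upper bound on each run so a site change or a bug cannot keep a spider going forever. Scrapy's built-in CloseSpider extension reads the following settings; the limits below are only examples to tune for your own crawl:

# Safety limits for unattended crawls (example values)
CLOSESPIDER_TIMEOUT = 3600        # stop after one hour
CLOSESPIDER_ERRORCOUNT = 50       # stop after 50 errors
CLOSESPIDER_PAGECOUNT = 10000     # stop after 10,000 responses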

Step 5: Store Data to PostgreSQL

This step assumes PostgreSQL is already running on the VPS with a scraperdb database and a scraper user created for the spider.

nano myproject/pipelines.py
import psycopg2
import os

class PostgreSQLPipeline:
    def __init__(self):
        self.conn = psycopg2.connect(
            host=os.getenv('DB_HOST', 'localhost'),
            database=os.getenv('DB_NAME', 'scraperdb'),
            user=os.getenv('DB_USER', 'scraper'),
            password=os.getenv('DB_PASS', 'password')
        )
        self.cursor = self.conn.cursor()
        self.create_table()

    def create_table(self):
        self.cursor.execute("""
            CREATE TABLE IF NOT EXISTS products (
                id          SERIAL PRIMARY KEY,
                url         TEXT UNIQUE,
                title       TEXT,
                price       TEXT,
                description TEXT,
                scraped_at  TIMESTAMP DEFAULT NOW()
            )
        """)
        self.conn.commit()

    def process_item(self, item, spider):
        self.cursor.execute("""
            INSERT INTO products (url, title, price, description, scraped_at)
            VALUES (%s, %s, %s, %s, %s)
            ON CONFLICT (url) DO UPDATE SET
                price      = EXCLUDED.price,
                scraped_at = EXCLUDED.scraped_at
        """, (
            item.get('url'),
            item.get('title'),
            item.get('price'),
            item.get('description'),
            item.get('scraped_at')
        ))
        self.conn.commit()
        return item

    def close_spider(self, spider):
        self.cursor.close()
        self.conn.close()

Enable in settings.py:

ITEM_PIPELINES = {
    'myproject.pipelines.PostgreSQLPipeline': 300,
}
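
The pipeline reads its credentials from environment variables, and Step 2 installed python-dotenv for exactly this purpose. One way to wire it up is to keep the secrets in a .env file and load it at the top of settings.py; the path and values below are placeholders for your own setup:

# /var/scrapers/myproject/.env  (keep this file out of version control)
# DB_HOST=localhost
# DB_NAME=scraperdb
# DB_USER=scraper
# DB_PASS=change-me

# At the top of settings.py:
from dotenv import load_dotenv
load_dotenv('/var/scrapers/myproject/.env')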

Step 6: Schedule Automated Crawls with Cron

nano /var/scrapers/run-spider.sh
#!/bin/bash
PROJECT_DIR="/var/scrapers/myproject"
LOG_DIR="/var/log/scrapers"
DATE=$(date +%Y-%m-%d_%H-%M)

mkdir -p $LOG_DIR

cd $PROJECT_DIR
source venv/bin/activate

echo "Starting spider at $(date)" >> $LOG_DIR/cron.log

scrapy crawl products \
  -s LOG_FILE=$LOG_DIR/products-$DATE.log \
  >> $LOG_DIR/cron.log 2>&1

echo "Spider finished at $(date) with exit code $?" >> $LOG_DIR/cron.log
chmod +x /var/scrapers/run-spider.sh

crontab -e
# Run every 6 hours
0 */6 * * * /bin/bash /var/scrapers/run-spider.sh

# Or, to run once a day at 3 AM instead:
# 0 3 * * * /bin/bash /var/scrapers/run-spider.sh

Step 7: IP Rotation with Rotating Proxies

If a site blocks your VPS IP, rotating proxies prevent bans and allow continuous scraping:

pip install scrapy-rotating-proxies
# In settings.py
ROTATING_PROXY_LIST = [
    'proxy1.example.com:8080',
    'proxy2.example.com:8080',
    'user:pass@proxy3.example.com:8080',
]

# Merge with the user-agent entries from Step 4 (a second DOWNLOADER_MIDDLEWARES
# assignment would silently replace them)
DOWNLOADER_MIDDLEWARES = {
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
    'scrapy_user_agents.middlewares.RandomUserAgentMiddleware': 400,
    'rotating_proxies.middlewares.RotatingProxyMiddleware': 610,
    'rotating_proxies.middlewares.BanDetectionMiddleware': 620,
}

VPS.DO’s 30IPs plan gives you 30 US IPv4 addresses — use them as your own proxy pool to rotate across without paying third-party proxy fees:

# Configure multiple outbound IPs on your VPS
# Then use each IP as a proxy endpoint in ROTATING_PROXY_LIST
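
If you go that route, one option is to skip the proxy layer entirely and bind each request to one of the server's own addresses with Scrapy's bindaddress request meta key. Below is a minimal sketch of a custom downloader middleware; the class name and IP addresses are illustrative, and the addresses must already be configured on the VPS's network interface:

# myproject/middlewares.py
import random

OUTBOUND_IPS = [
    '203.0.113.10',   # placeholder addresses; replace with the IPs assigned to your VPS
    '203.0.113.11',
    '203.0.113.12',
]

class BindAddressRotationMiddleware:
    """Pick a different outgoing source IP for each request."""

    def process_request(self, request, spider):
        request.meta.setdefault('bindaddress', random.choice(OUTBOUND_IPS))

# Enable it by adding the class to DOWNLOADER_MIDDLEWARES in settings.py, e.g.:
#     'myproject.middlewares.BindAddressRotationMiddleware': 100,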

Step 8: Monitor Spider Health with PM2

sudo apt install nodejs npm -y   # PM2 runs on Node.js
sudo npm install -g pm2

# For long-running spiders, wrap in a loop script
nano /var/scrapers/continuous-spider.py
import subprocess
import time
import logging

logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s %(levelname)s %(message)s')

def run_spider():
    logging.info("Starting spider run...")
    # Use the virtualenv's interpreter so Scrapy and its dependencies are found
    result = subprocess.run(
        ['/var/scrapers/myproject/venv/bin/python', '-m', 'scrapy', 'crawl', 'products'],
        capture_output=True, text=True,
        cwd='/var/scrapers/myproject'
    )
    if result.returncode != 0:
        logging.error(f"Spider failed: {result.stderr}")
    else:
        logging.info("Spider run complete")

while True:
    run_spider()
    logging.info("Waiting 4 hours before next run...")
    time.sleep(4 * 3600)

cd /var/scrapers/myproject
source venv/bin/activate

pm2 start /var/scrapers/continuous-spider.py \
  --name "product-scraper" \
  --interpreter python3

pm2 startup && pm2 save

Step 9: Handle Anti-Scraping Measures

Respect rate limits

Always keep AUTOTHROTTLE_ENABLED = True and a sensible DOWNLOAD_DELAY. Aggressive scraping risks IP bans and puts needless load on the target server.

Handle JavaScript-rendered content

pip install scrapy-playwright
playwright install chromium   # download the browser binary (may also need: playwright install-deps)

# In settings.py
DOWNLOAD_HANDLERS = {
    "http": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
    "https": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
}
TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"  # required by scrapy-playwright

# In spider request:
yield scrapy.Request(url, meta={"playwright": True})
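
Pages that build their content with JavaScript often also need an explicit wait for the element you plan to parse. scrapy-playwright exposes Playwright page calls through the playwright_page_methods meta key; a short sketch, with the selector chosen only as an example:

from scrapy_playwright.page import PageMethod

yield scrapy.Request(
    url,
    meta={
        "playwright": True,
        "playwright_page_methods": [
            # Wait until the product description has rendered before parsing
            PageMethod("wait_for_selector", "div.product-desc"),
        ],
    },
)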

Identify yourself honestly

# In settings.py — be transparent about who you are
USER_AGENT = 'MyDataCollector/1.0 (+https://yourdomain.com/bot-info)'

Final Thoughts

A VPS transforms Scrapy from a development tool into a production data pipeline. Scheduled crawls, persistent databases, proxy rotation, and PM2 process management give you a robust, 24/7 data collection system that runs without intervention. VPS.DO’s USA VPS plans offer the bandwidth, SSD storage, and multiple IPv4 addresses that make large-scale, responsible scraping practical.

Fast • Reliable • Affordable VPS - DO It Now!

Get top VPS hosting with VPS.DO’s fast, low-cost plans. Try risk-free with our 7-day no-questions-asked refund and start today!