<Window x:Class="JmeterParam.MainWindow"
        xmlns="http://schemas.microsoft.com/winfx/2006/xaml/presentation"
        xmlns:x="http://schemas.microsoft.com/winfx/2006/xaml"
        xmlns:d="http://schemas.microsoft.com/expression/blend/2008"
        xmlns:mc="http://schemas.openxmlformats.org/markup-compatibility/2006"
        xmlns:local="clr-namespace:JmeterParam"
        mc:Ignorable="d"
        Title="MainWindow" Height="450" Width="552.119">
    <!-- Two-column form: right-aligned labels in column 0, value boxes in column 1. -->
    <!-- Rows 0-4 hold the inputs, row 5 the calculate button, row 6 has no child element, rows 7-10 hold the read-only results. -->
    <Grid>
        <Grid.ColumnDefinitions>
            <ColumnDefinition/>
            <ColumnDefinition/>
        </Grid.ColumnDefinitions>
        <Grid.RowDefinitions>
            <RowDefinition/>
            <RowDefinition/>
            <RowDefinition/>
            <RowDefinition/>
            <RowDefinition/>
            <RowDefinition/>

            <RowDefinition/>
            <RowDefinition/>
            <RowDefinition/>
            <RowDefinition/>
            <RowDefinition/>
        </Grid.RowDefinitions>

        <!-- Input: number of servers used to send requests (bound to ServerCount). -->
        <!-- Every input box routes typed text through TextBox_PreviewTextInput. -->
        <Label Content="リクエスト送信に使用するサーバ台数：" HorizontalAlignment="Right" VerticalAlignment="Center" Grid.Column="0" Grid.Row="0"/>
        <TextBox HorizontalAlignment="Left" Text="{Binding ServerCount}" Height="23" TextWrapping="Wrap" VerticalAlignment="Center" Width="70" Grid.Column="1" Grid.Row="0" TextAlignment="Right" PreviewTextInput="TextBox_PreviewTextInput"/>

        <!-- Input: assumed throughput (bound to AssumedThroughput). -->
        <Label Content="想定スループット：" HorizontalAlignment="Right" VerticalAlignment="Center" Grid.Column="0" Grid.Row="1"/>
        <TextBox HorizontalAlignment="Left" Text="{Binding AssumedThroughput}" Height="23" TextWrapping="Wrap" VerticalAlignment="Center" Width="70" Grid.Column="1" Grid.Row="1" TextAlignment="Right" PreviewTextInput="TextBox_PreviewTextInput"/>

        <!-- Input: number of requests contained in one thread group (bound to RequestCountByThreadGroup). -->
        <Label Content="1スレッドグループに含まれるリクエスト数：" HorizontalAlignment="Right" VerticalAlignment="Center" Grid.Column="0" Grid.Row="2"/>
        <TextBox HorizontalAlignment="Left" Text="{Binding RequestCountByThreadGroup}" Height="23" TextWrapping="Wrap" VerticalAlignment="Center" Width="70" Grid.Column="1" Grid.Row="2" TextAlignment="Right" PreviewTextInput="TextBox_PreviewTextInput"/>

        <!-- Input: test duration (bound to StressPeriod). -->
        <Label Content="試験実施時間：" HorizontalAlignment="Right" VerticalAlignment="Center" Grid.Column="0" Grid.Row="3"/>
        <TextBox HorizontalAlignment="Left" Text="{Binding StressPeriod}" Height="23" TextWrapping="Wrap" VerticalAlignment="Center" Width="70" Grid.Column="1" Grid.Row="3" TextAlignment="Right" PreviewTextInput="TextBox_PreviewTextInput"/>

        <!-- Input: assumed request processing time (bound to AssumedResponceTime). -->
        <Label Content="想定リクエスト処理時間：" HorizontalAlignment="Right" VerticalAlignment="Center" Grid.Column="0" Grid.Row="4"/>
        <TextBox HorizontalAlignment="Left" Text="{Binding AssumedResponceTime}" Height="23" TextWrapping="Wrap" VerticalAlignment="Center" Width="70" Grid.Column="1" Grid.Row="4" TextAlignment="Right" PreviewTextInput="TextBox_PreviewTextInput"/>

        <!-- "Calculate" button spanning both columns; its click is handled by CalcButton_Click. -->
        <Button x:Name="CalcButton" Grid.Column="0" Grid.ColumnSpan="2" Grid.Row="5" Content="計算" Margin="10,5" Click="CalcButton_Click"/>

        <!-- Result: thread count (read-only, bound to ThreadCount). -->
        <Label Content="スレッド数：" HorizontalAlignment="Right" VerticalAlignment="Center" Grid.Column="0" Grid.Row="7"/>
        <TextBox HorizontalAlignment="Left" Height="23" Text="{Binding ThreadCount}" TextWrapping="Wrap" VerticalAlignment="Center" Width="70" Grid.Column="1" Grid.Row="7" IsReadOnly="True" TextAlignment="Right"/>

        <!-- Result: loop count (read-only, bound to LoopCount). -->
        <Label Content="ループ回数：" HorizontalAlignment="Right" VerticalAlignment="Center" Grid.Column="0" Grid.Row="8"/>
        <TextBox HorizontalAlignment="Left" Height="23" Text="{Binding LoopCount}" TextWrapping="Wrap" VerticalAlignment="Center" Width="70" Grid.Column="1" Grid.Row="8" IsReadOnly="True" TextAlignment="Right"/>

        <!-- Result: constant throughput timer value (read-only, bound to ConstantThroughputTimer). -->
        <Label Content="定数スループットタイマー：" HorizontalAlignment="Right" VerticalAlignment="Center" Grid.Column="0" Grid.Row="9"/>
        <TextBox HorizontalAlignment="Left" Height="23" Text="{Binding ConstantThroughputTimer}" TextWrapping="Wrap" VerticalAlignment="Center" Width="70" Grid.Column="1" Grid.Row="9" IsReadOnly="True" TextAlignment="Right"/>

        <!-- Result: total access count (read-only, bound to AllAccessCount). -->
        <Label Content="総アクセス数：" HorizontalAlignment="Right" VerticalAlignment="Center" Grid.Column="0" Grid.Row="10"/>
        <TextBox HorizontalAlignment="Left" Height="23" Text="{Binding AllAccessCount}" TextWrapping="Wrap" VerticalAlignment="Center" Width="70" Grid.Column="1" Grid.Row="10" IsReadOnly="True" TextAlignment="Right"/>

    </Grid>
</Window>

/// <summary>
/// Interaction logic for MainWindow.xaml. Collects the load-test inputs and,
/// when the calculate button is clicked, derives the thread count, loop count,
/// constant-throughput-timer value and total access count from them.
/// </summary>
public partial class MainWindow : Window
{
    /// <summary>
    /// Binding source for the window. Every property raises
    /// <see cref="PropertyChanged"/> when it is set.
    /// </summary>
    private class ViewModel : INotifyPropertyChanged
    {
        private int serverCount;
        private int assumedThroughput;
        private int requestCountByThreadGroup;
        private int stressPeriod;
        private double assumedResponceTime;
        private int threadCount;
        private int loopCount;
        private int constantThroughputTimer;
        private int allAccessCount;

        /// <summary>Raised after any bound property has been assigned.</summary>
        public event PropertyChangedEventHandler PropertyChanged;

        /// <summary>Input: number of servers used to send requests.</summary>
        public int ServerCount
        {
            get { return this.serverCount; }
            set
            {
                this.serverCount = value;
                this.RaisePropertyChanged(nameof(ServerCount));
            }
        }

        /// <summary>Input: assumed throughput.</summary>
        public int AssumedThroughput
        {
            get { return this.assumedThroughput; }
            set
            {
                this.assumedThroughput = value;
                this.RaisePropertyChanged(nameof(AssumedThroughput));
            }
        }

        /// <summary>Input: number of requests contained in one thread group.</summary>
        public int RequestCountByThreadGroup
        {
            get { return this.requestCountByThreadGroup; }
            set
            {
                this.requestCountByThreadGroup = value;
                this.RaisePropertyChanged(nameof(RequestCountByThreadGroup));
            }
        }

        /// <summary>Input: duration of the test.</summary>
        public int StressPeriod
        {
            get { return this.stressPeriod; }
            set
            {
                this.stressPeriod = value;
                this.RaisePropertyChanged(nameof(StressPeriod));
            }
        }

        /// <summary>Input: assumed processing time of one request.</summary>
        public double AssumedResponceTime
        {
            get { return this.assumedResponceTime; }
            set
            {
                this.assumedResponceTime = value;
                this.RaisePropertyChanged(nameof(AssumedResponceTime));
            }
        }

        /// <summary>Result: number of threads.</summary>
        public int ThreadCount
        {
            get { return this.threadCount; }
            set
            {
                this.threadCount = value;
                this.RaisePropertyChanged(nameof(ThreadCount));
            }
        }

        /// <summary>Result: loop count.</summary>
        public int LoopCount
        {
            get { return this.loopCount; }
            set
            {
                this.loopCount = value;
                this.RaisePropertyChanged(nameof(LoopCount));
            }
        }

        /// <summary>Result: value for the constant throughput timer.</summary>
        public int ConstantThroughputTimer
        {
            get { return this.constantThroughputTimer; }
            set
            {
                this.constantThroughputTimer = value;
                this.RaisePropertyChanged(nameof(ConstantThroughputTimer));
            }
        }

        /// <summary>Result: total number of accesses.</summary>
        public int AllAccessCount
        {
            get { return this.allAccessCount; }
            set
            {
                this.allAccessCount = value;
                this.RaisePropertyChanged(nameof(AllAccessCount));
            }
        }

        /// <summary>
        /// Recomputes <see cref="AllAccessCount"/> as
        /// threads x requests per thread group x loops x servers.
        /// </summary>
        public void UpdateAllAccessCount()
        {
            this.AllAccessCount = this.ThreadCount * this.RequestCountByThreadGroup * this.LoopCount * this.ServerCount;
        }

        /// <summary>Notifies bindings that <paramref name="propertyName"/> was assigned.</summary>
        /// <param name="propertyName">Name of the property that was set.</param>
        private void RaisePropertyChanged(string propertyName)
        {
            this.PropertyChanged?.Invoke(this, new PropertyChangedEventArgs(propertyName));
        }
    }

    /// <summary>
    /// Accepts text made up solely of digits and dots. Anchored so that a
    /// multi-character input such as "a1" is rejected, and built once instead
    /// of on every keystroke.
    /// </summary>
    private static readonly Regex NumericInputPattern = new Regex(@"\A[0-9.]+\z");

    private readonly ViewModel model = new ViewModel();

    /// <summary>
    /// Builds the window, centres it on screen as a tool window and binds it to
    /// the view model, pre-filling the assumed request processing time with 1.5.
    /// </summary>
    public MainWindow()
    {
        this.InitializeComponent();
        this.WindowStartupLocation = WindowStartupLocation.CenterScreen;
        this.WindowStyle = WindowStyle.ToolWindow;

        this.DataContext = this.model;
        this.model.AssumedResponceTime = 1.5;
    }

    /// <summary>
    /// Validates the inputs and fills in the result properties.
    /// Shows a message box and leaves the results untouched when an input
    /// that is used as a divisor is not positive.
    /// </summary>
    private void CalcButton_Click(object sender, RoutedEventArgs e)
    {
        // The checks reject 0 as well, so the messages ask for a positive value.
        if (this.model.ServerCount <= 0)
        {
            MessageBox.Show(this, "サーバ台数は 1 以上");
            return;
        }

        if (this.model.AssumedResponceTime <= 0)
        {
            MessageBox.Show(this, "想定リクエスト処理時間は 0 より大きい値");
            return;
        }

        if (this.model.RequestCountByThreadGroup <= 0)
        {
            MessageBox.Show(this, "リクエスト数は 1 以上");
            return;
        }

        double responseTime = this.model.AssumedResponceTime;

        // Divide as double: int / int would truncate the per-server throughput
        // before the multiplication and under-estimate the thread count.
        double throughputPerServer = (double)this.model.AssumedThroughput / this.model.ServerCount;
        this.model.ThreadCount = (int)Math.Ceiling(throughputPerServer * responseTime);

        this.model.LoopCount = (int)Math.Ceiling(this.model.StressPeriod / responseTime / this.model.RequestCountByThreadGroup);

        // NOTE(review): the factor 60 presumably converts to a per-minute rate - confirm.
        this.model.ConstantThroughputTimer = (int)Math.Ceiling(60.0 / responseTime * this.model.ThreadCount);
        this.model.UpdateAllAccessCount();
    }

    /// <summary>
    /// Suppresses typed text that contains anything other than digits and dots.
    /// </summary>
    private void TextBox_PreviewTextInput(object sender, TextCompositionEventArgs e)
    {
        e.Handled = !NumericInputPattern.IsMatch(e.Text);
    }
}

アプリの見た目はこんな感じ

f:id:monakaice88:20190531065350p:plain

おわりに

このパラメータの決め方を使うことで、思った通りの負荷を掛けやすくなった。

負荷がわかりやすくなったので、どのスループット辺りから、WebサーバのCPUパワーが足りなくなりそうかも分かりやすくなった。

Jmeterの定数スループットタイマは使うべき

2019-02-13

Python+Scrapyでスクレイピングした結果をDB(PostgreSQL)に保存する

PostgreSQL Python

はじめに

前に、Scrapyでクローリング・スクレイピングするスクリプトを作成した。

このときは、とあるディレクトリにスクレイピングした結果をjsonにして出力した。

↓そのときの話

monakaice88.hatenablog.com

そして、出力したjsonを別のスクリプトでDBに保存しようかと考えていた。

前回全く使用していなかった「pipelines.py」に処理を追加すると、DBに追加出来ることがわかったので、処理を追加してみた。

はじめに
参考サイト
環境
DBのテーブル作成
スパイダーの作成
DB接続用の設定ファイルを作成
パイプの作成
設定の変更
クローニングの実行
実行結果
おわりに

参考サイト

doc.scrapy.org

環境

PostgreSQL 11
Python 3.6.8
Scrapy 1.5.1
psycopg2
Scrapyで作成したプロジェクト名：MachiMachi

DBのテーブル作成

スクレイピングした結果を保存するテーブルを作成する。

今回は、以下のデータを登録するようにした。

URL【主キー】
ドメイン
登録日時
タイトル
コンテンツ（本文）

SQLはこんな感じ

-- Contents: one row per scraped page.
-- Every column is mandatory; the page URL is the primary key, so a given URL
-- can be stored only once.
create table contents (
  url text not null
  , domain_name text not null
  , register_date timestamp with time zone not null
  , title text not null
  , contents text not null
  , constraint contents_PKC primary key (url)
) ;

-- Catalog descriptions for the table and each of its columns.
comment on table contents is 'コンテンツ';
comment on column contents.url is 'URL';
comment on column contents.domain_name is 'ドメイン';
comment on column contents.register_date is '登録日時';
comment on column contents.title is 'タイトル';
comment on column contents.contents is 'コンテンツ';

スパイダーの作成

今回も前回と同様に、「ライフハッカー（日本語版）」と「TechCrunch（日本語版）」をクローニングする。

前に作成したスパイダーと全く同じものを使うので、今回は省略。

DB接続用の設定ファイルを作成

Scrapyで作成したプロジェクトディレクトリにDB接続用の設定ファイル「setting.ini」を作成し、pipelineクラスから使用するようにした

[default]
host=192.168.xx.xx
port=5432
dbname=xxxxxxx
user=xxxxxxx
password=xxxxxxx

パイプの作成

Scrapyのドキュメントを参考に、スクレイピングした結果をDBにインサートする処理を作成

URLを主キーにしているので、既に同じURLが存在している場合は、インサートしないようにしている。

from configparser import ConfigParser
import logging
import psycopg2
import datetime


class MachimachiPipeline(object):
    """Item pipeline that stores scraped items in the PostgreSQL ``contents`` table.

    The connection is opened when the spider starts and committed and closed
    when it finishes. Items whose URL is already stored are skipped.
    """

    def __init__(self):
        self.connection = None
        self.cursor = None
        # Taken once per pipeline instance, so every row inserted by this
        # instance carries the same registration timestamp.
        self.register_datetime = datetime.datetime.now()

    def open_spider(self, spider):
        """
        Called when the spider is opened; connects to the database.

        Connection settings are read from the ``[default]`` section of
        ``setting.ini``, resolved relative to the current working directory.

        :param spider: the spider being run
        """
        config = ConfigParser()
        config.read('setting.ini')
        host = config.get('default', 'host')
        port = config.get('default', 'port')
        db_name = config.get('default', 'dbname')
        user = config.get('default', 'user')
        password = config.get('default', 'password')

        logging.info('connecting to db.')
        # Keyword arguments instead of a hand-formatted DSN string: values that
        # contain spaces or quotes (typically the password) are passed intact.
        self.connection = psycopg2.connect(host=host, port=port, dbname=db_name,
                                           user=user, password=password)
        self.connection.autocommit = False
        self.cursor = self.connection.cursor()
        logging.info('connected to db.')

    def close_spider(self, spider):
        """
        Called when the spider is closed; commits and releases the connection.

        Does nothing when the connection was never established.

        :param spider: the spider being closed
        """
        if self.connection is None:
            return
        try:
            self.connection.commit()
        finally:
            # Release the resources even if the commit itself fails.
            if self.cursor is not None:
                self.cursor.close()
            self.connection.close()
            logging.info('close connection to db.')

    def process_item(self, item, spider):
        """
        Called for every scraped item; inserts it unless its URL already exists.

        :param item: the scraped data (``url``, ``title`` and ``body`` are read)
        :param spider: the spider that produced the item; its first allowed
            domain is stored as the domain name
        :return: the item, unchanged
        """
        url = item['url']
        values = (
            url,
            spider.allowed_domains[0],
            self.register_datetime,
            item['title'],
            item['body']
        )
        # One atomic statement instead of SELECT followed by INSERT: a URL
        # inserted concurrently by another process is skipped instead of
        # raising a unique violation that would abort the whole transaction.
        self.cursor.execute('INSERT INTO contents (url, domain_name, register_date , title, contents)'
                            ' VALUES (%s, %s, %s, %s, %s)'
                            ' ON CONFLICT (url) DO NOTHING', values)
        if self.cursor.rowcount == 0:
            logging.info('url is already registered. url:%s', url)
        else:
            logging.info('contents is registered!!. url:%s', url)
        return item

設定の変更

Scrapyの設定も変更する。

「settings.py」の中に、パイプラインの設定がコメントアウトされているので、そのコメントアウトを解除する。

# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'MachiMachi.pipelines.MachimachiPipeline': 300,
}

クローニングの実行

全スパイダーのクローニングを実行するスクリプトを作成する（前に作成したものを一部修正しただけ）

修正点としては、DBに保存するのでファイル出力をしないようにした。

import subprocess
import multiprocessing


def get_crawler_list():
    """Return the spider names printed by ``scrapy list``.

    :return: list of non-empty spider names, one per output line
    :raises RuntimeError: if ``scrapy list`` exits with a non-zero status;
        the message carries the exit code and the captured stderr
    """
    process = subprocess.Popen('scrapy list', shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    stdout_data, stderr_data = process.communicate()
    if process.returncode != 0:
        raise RuntimeError('"scrapy list" failed with exit code %s: %s'
                           % (process.returncode, stderr_data.decode('utf-8', errors='replace').strip()))
    # splitlines() + strip() so CRLF output does not leave a trailing '\r'
    # on each spider name.
    names = (line.strip() for line in stdout_data.decode('utf-8').splitlines())
    return [name for name in names if name]


def execute_scraping(crawler_name):
    """Run ``scrapy crawl`` for one spider at INFO log level and wait for it.

    :param crawler_name: name of the spider to run
    """
    # Build the argument list directly; formatting a string and re-splitting
    # it on whitespace would break a name that contains whitespace.
    subprocess.call(['scrapy', 'crawl', crawler_name, '--loglevel=INFO'])


def main():
    """Start one process per spider, wait for all of them, then report completion."""
    jobs = []
    for crawler_name in get_crawler_list():
        job = multiprocessing.Process(target=execute_scraping, args=(crawler_name,))
        jobs.append(job)
        job.start()

    # Plain loop: joining is done for its side effect, not to build a list.
    for job in jobs:
        job.join()

    print('finish !!!!')


if __name__ == '__main__':
    main()

実行結果

クローニングを実行すると、PostgreSQLのテーブルに、以下のようにデータが保存された。（見やすくするため、ドメインとタイトルのみ表示）

select domain_name, title from contents;

    domain_name    |                                                                       title                                                                       
-------------------+-------------------------------------------------------------------------------------------------------------
 www.lifehacker.jp | 孫の世話をする祖父母は長生きする：調査結果
 jp.techcrunch.com | Amazonが家庭用メッシュルーターのEeroを買収してEcho製品拡販のベースに
 jp.techcrunch.com | ジェフ・ベゾスのメッセージ暴露、サウジが関与を否定
 jp.techcrunch.com | 米国のiPhoneユーザーが昨年アプリに使った金額は平均79ドル、前年比36%アップ
 jp.techcrunch.com | GoogleドキュメントのAPIでタスクの自動化が可能に
 jp.techcrunch.com | 無登録物件のリスト掲載でパリ市がAirbnbを告訴
 jp.techcrunch.com | 小売・飲食企業のアプリ開発支援を手がけるエンターモーションが2億円調達
 jp.techcrunch.com | ソフトバンク、自動運転配達のnuroに9.4億ドルを資金提供
 www.lifehacker.jp | 合体と分離ができる3in1トラベルバッグ｢JW Weekender｣を使ってみた
 www.lifehacker.jp | そこまでの生産性、本当に必要ですか？
 www.lifehacker.jp | キッチンに貼るだけで鍋蓋もまな板もスッキリ収納。菜箸置きにも使えそう〜
 www.lifehacker.jp | ｢この1本｣で仕事はもっと上手くいく。ビジネスコーチに教わる缶コーヒー活用術
 www.lifehacker.jp | オフィスとジムで兼用できる！ どんなシーンでも履ける万能シューズがコール ハーンから登場
 www.lifehacker.jp | 贈った方も得をする！ コスパ抜群な｢バレンタインギフト｣を選ぶコツとは？
 www.lifehacker.jp | 元Google人材育成統括部長からのメッセージ｢もうがんばらないでください｣
 www.lifehacker.jp | 子どもに初めてのスマホ、何に注意すべき？ ｢2019年度版チェックリスト｣
 www.lifehacker.jp | 世界的メール配信サービス会社メールチンプのCEO、ベン・チェストナットさんの仕事術
 www.lifehacker.jp | 働き方を変えるには｢短パン｣から？ 全員｢複業｣スタートアップCEOの仕事術
 www.lifehacker.jp | オフシーズンの劇場をレンタルスペース化。アーティスティック・ディレクター、エイドリアンさんの仕事術
 www.lifehacker.jp | 親が子どもに言ってはいけない8つの言葉とは？
 www.lifehacker.jp | 自分が｢敏感すぎる｣｢繊細すぎる｣人かどうかを測る｢HSPセルフチェック｣

おわりに

パイプラインを使い、DBに登録する処理ができた。

ドキュメントには、パイプラインの使用用途として、以下を例に挙げていた

HTMLのクレンジング
データのバリデーション
重複データの削除
データベースへの保存

Scrapyでクローリング・スクレイピングした結果を、他のモジュールで加工・解析・保存をやろうと考えていたけど、パイプラインの組み合わせだけで、テキストマイニングまで出来そう。

2019-02-09

Python3.6+Scrapyでスクレイピングしてみた

Python HTML

はじめに

機械学習について勉強するため、機械学習を使った何かを作ろうと思っている今日このごろ

いくつかサンプルが載っているような本を買っても、サンプルを動かすのはモチベーションが上がらない

やはりモチベーションが上がるものは、自分がやりたいものを作るべきだなぁ

自分が機械学習を利用してやりたいことはなんだろうなーと考えた

自分が興味あるものを学習して、コンテンツ（または元のサイトのURL）を配信するものを作ってみたい

もうすでに、公開されているサービスを利用しているけど気にしない（作ることにきっと意味がある）

そんなことで、コンテンツの内容を取得するため、Pythonでスクレイピングをやってみることにした。

昔にスクレイピングをやったことがあるけど、サーバに負荷を掛けないように配慮されたライブラリを探してみた。（昔使っていたのは、beautifulsoup4というライブラリ）

Scrapyというライブラリが、クローリングの際、時間を空けてクローリングができるみたい

なので、今回はScrapyでスクレイピングをやってみた話。

はじめに
参考サイト
Scrapyのインストール
Scrapyを使うための作業
おわりに

参考サイト

note.nkmk.me

dragstar.hatenablog.com

Scrapyのインストール

まず、Scrapyをインストール

pip install scrapy

Scrapyを使うための作業

scrapy startprojectでScrapyのプロジェクト作成
itemsにスクレイピング後のデータ構造を定義
scrapy genspiderでクローリング・スクレイピングをするためのクラス（spider）を作成
settings.pyにクローリング設定を記述
scrapy crawlでクローリングとスクレイピングを実行

チュートリアルは以下のサイト

Scrapy Tutorial — Scrapy 1.6.0 documentation

1. Scrapyのプロジェクト作成

以下のコマンドで、Scrapyのプロジェクトを作成する。今回のプロジェクト名はscraperにした

scrapy startproject scraper

このコマンドを実行すると、以下のようなディレクトリ・ファイルが生成される。

scraper/
├── scraper
│   ├── __init__.py
│   ├── items.py
│   ├── middlewares.py
│   ├── pipelines.py
│   ├── settings.py
│   └── spiders
│       └── __init__.py
└── scrapy.cfg

2. スクレイピング後のデータ定義

スクレイピングしたときに取得する情報を「items.py」に定義する。

今回は、タイトル、本文、スクレイピングしたURLを取得しようと思ったので、以下のような感じになった。

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class ScraperItem(scrapy.Item):
    """Container for one scraped page: its URL, title and body text."""
    # define the fields for your item here like:
    # name = scrapy.Field()
    # URL of the scraped page
    url = scrapy.Field()
    # title of the page
    title = scrapy.Field()
    # body text of the page
    body = scrapy.Field()

3. スクレイピングをするためのクラス（spider）を作成

最初のスクレイピングを試すサイトとして、LifeHacker（日本語版）のページでやってみることにした。

作成したプロジェクトフォルダの中で、spiderを作成するコマンドを実行する

cd scraper
scrapy genspider lifehacker www.lifehacker.jp

spidersフォルダの中に新しく「lifehacker.py」が作成されているので、この中にスクレイピングのロジックを作成する

# -*- coding: utf-8 -*-
import scrapy
from scraper.items import ScraperItem


class LifehackerSpider(scrapy.Spider):
    """Spider for www.lifehacker.jp.

    Reads the summary entries on the start page, then follows each entry's
    link to collect the article body.
    """
    name = 'lifehacker'
    allowed_domains = ['www.lifehacker.jp']
    start_urls = ['https://www.lifehacker.jp/']     # changed from http to https

    def parse(self, response):
        """Yield one detail-page request per summary entry on the listing page.

        The partially filled item (title and URL) travels with the request in
        ``meta['item']``.

        :param response: response for the listing page
        """
        for content_item in response.css('div.lh-summary'):
            href = content_item.css('h3.lh-summary-title a::attr(href)').extract_first()
            if not href:
                # Without a link, urljoin() would return the listing page URL
                # itself, which would then be fetched again and emitted as a
                # bogus item. Skip such entries.
                continue

            item = ScraperItem()
            item['title'] = content_item.css('h3.lh-summary-title a::text').extract_first()
            url = response.urljoin(href)
            item['url'] = url

            yield scrapy.Request(
                url,
                callback=self.parse_detail,
                meta={'item': item}
            )

    @classmethod
    def parse_detail(cls, response):
        """Fill in the body from the article page and yield the finished item.

        :param response: response for the article page; carries the item in
            ``meta['item']``
        """
        item = response.meta['item']
        str_list = response.css('#realEntryBody *::text').extract()
        item['body'] = ''.join(str_list)
        yield item

最初はLifeHacker（日本語版）のTopページにアクセスし、タイル状に並んでいるコンテンツを更にスクレイピングしている。

１つじゃ物足りないので、TechCrunch（日本語版）もスクレイピングするクラスを作成してみた

以下のコマンドで、TechCrunch用のspiderを作成

TechCrunchはRSSがあったので、RSS経由でコンテンツを取得するようにする。

scrapy genspider techcrunch https://jp.techcrunch.com/feed/

新しく作成された「techcrunch.py」にスクレイピングのロジックを作成する。

# -*- coding: utf-8 -*-
import scrapy
from scraper.items import ScraperItem


class TechcrunchSpider(scrapy.Spider):
    """Spider for jp.techcrunch.com.

    Reads the entries of the site's feed, then follows each entry's link to
    collect the article body.
    """
    name = 'techcrunch'
    allowed_domains = ['jp.techcrunch.com']
    start_urls = ['https://jp.techcrunch.com/feed/']     # changed from http to https

    def parse(self, response):
        """Yield one article request per ``item`` element of the feed.

        The partially filled item (title and URL) travels with the request in
        ``meta['item']``.

        :param response: response for the feed
        """
        # Drop XML namespaces so the plain element selectors below match.
        response.selector.remove_namespaces()

        for content_item in response.css('item'):
            link = content_item.css('link::text').extract_first()
            if not link:
                # scrapy.Request raises on a missing URL, which would abort the
                # remaining feed entries. Skip entries without a link.
                continue

            item = ScraperItem()
            item['title'] = content_item.css('title::text').extract_first()
            item['url'] = link

            yield scrapy.Request(
                link,
                callback=self.parse_detail,
                meta={'item': item}
            )

    @classmethod
    def parse_detail(cls, response):
        """Fill in the body from the article page and yield the finished item.

        Text directly inside div, script, style and span elements is left out
        of the body.

        :param response: response for the article page; carries the item in
            ``meta['item']``
        """
        item = response.meta['item']
        str_list = response.css('div.article-entry.text :not(div):not(script):not(style):not(span)::text')\
            .extract()
        item['body'] = ''.join(str_list).strip()
        yield item

4. クローニング設定を記述

スクレイピングの共通な設定は、「settings.py」にあり、以下の項目を設定した。

DOWNLOAD_DELAY = 3
FEED_EXPORT_ENCODING = 'utf-8'

DOWNLOAD_DELAYは、同じWebページ内でのダウンロード待ち時間。

FEED_EXPORT_ENCODINGは、スクレイピングの結果をファイル出力するときのエンコード設定。これを設定せずにファイル出力すると、日本語文字が「\u8a71」みたいな文字になる

5. クローニングの実行

スクレイピングを単体で実行するには、以下のコマンドを実行する。

scrapy crawl <スパイダー名> -o <出力ファイルパス>

なので今回作成した、LifeHackerをスクレイピングする場合は、以下のコマンドになる

scrapy crawl lifehacker -o result.json

スクレイピング中のログを表示したくない場合は、オプションに「--nolog」を追加する

scrapy crawl lifehacker -o result.json --nolog

クローニングの実行結果

LifeHacker（日本語版）をクローニングした結果はコチラ

全部で37のデータが取れたけど、長くなるので一部だけ

[
    {
        "title": "朝の出勤時間を早めると得られる8つのメリット",
        "url": "https://www.lifehacker.jp/2019/02/if-youre-lazy-show-up-early-to-work.html",
        "body": "（中略）"
    },
    {
        "title": "ペットに合った温度にできるホットマット？！防水で、自動電源オフ機能も搭載されてるから安心して使えるよ〜",
        "url": "https://www.lifehacker.jp/2019/02/amazon-pet-heater.html",
        "body": "（中略）"
    },
    {
        "title": "ネットショッピングで衝動買いを防ぐコツ｢曜日を決める｣にある、2つのメリット",
        "url": "https://www.lifehacker.jp/2019/02/pick-a-day-of-the-week-to-do-all-of-your-online-shoppin.html",
        "body": "（中略）"
    }
]

クローニングの一括実行

コマンドライン上でscrapyのオプションを見る限り、作成した全てのspiderを起動する方法が無さそう・・・

なので、作成した全spiderを起動するスクリプトを作成した。

import subprocess
import multiprocessing
import datetime


def get_crawler_list():
    """Return the spider names printed by ``scrapy list``.

    :return: list of non-empty spider names, one per output line
    :raises RuntimeError: if ``scrapy list`` exits with a non-zero status;
        the message carries the exit code and the captured stderr
    """
    process = subprocess.Popen('scrapy list', shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    stdout_data, stderr_data = process.communicate()
    if process.returncode != 0:
        raise RuntimeError('"scrapy list" failed with exit code %s: %s'
                           % (process.returncode, stderr_data.decode('utf-8', errors='replace').strip()))
    # splitlines() + strip() so CRLF output does not leave a trailing '\r'
    # on each spider name.
    names = (line.strip() for line in stdout_data.decode('utf-8').splitlines())
    return [name for name in names if name]


def execute_scraping(crawler_name, execute_time):
    """Run ``scrapy crawl`` for one spider without logging, exporting to a JSON file.

    The output goes to ``scrape_results/<spider>_<YYYYmmddHHMMSS>.json``.

    :param crawler_name: name of the spider to run
    :param execute_time: datetime used for the timestamp in the file name
    """
    date_str = execute_time.strftime('%Y%m%d%H%M%S')
    output_path = 'scrape_results/%s_%s.json' % (crawler_name, date_str)
    # Build the argument list directly; formatting a string and re-splitting
    # it on whitespace would break a name that contains whitespace.
    subprocess.call(['scrapy', 'crawl', crawler_name, '-o', output_path, '--nolog'])


def main():
    """Start one process per spider, wait for all of them, then report completion.

    All spiders share a single start timestamp, which is passed on for use in
    the output file names.
    """
    execute_time = datetime.datetime.now()

    jobs = []
    for crawler_name in get_crawler_list():
        job = multiprocessing.Process(target=execute_scraping, args=(crawler_name, execute_time))
        jobs.append(job)
        job.start()

    # Plain loop: joining is done for its side effect, not to build a list.
    for job in jobs:
        job.join()

    print('finish !!!!')


if __name__ == '__main__':
    main()

やっていることは

コマンドscrapy listでspider名の一覧を取得し、spider名のリスト作成
spiderをマルチプロセスで、それぞれ実行
すべてのクローリングが終わるまで待機

クローニングの実行結果は、scrape_resultsというディレクトリに日時付きのファイル名で出力するようにしている。

おわりに

前にもスクレイピングをするpython スクリプトを作成したことがあったけど、そのときには「同じドメインの場合、時間を空けてアクセス」するロジックが無かった・・・

このScrapyでは、自動でやってくれるのでとても便利

もなかアイスの試食品

「とりあえずやってみたい」そんな気持ちが先走りすぎて挫折が多い私のメモ書きみたいなものです．

負荷テストで使うJmeterのパラメータを決めやすくする

はじめに

おわりに

Python+Scrapyでスクレイピングした結果をDB(PostgreSQL)に保存する

はじめに

参考サイト

環境

DBのテーブル作成

スパイダーの作成

DB接続用の設定ファイルを作成

パイプの作成

設定の変更

クローニングの実行

実行結果

おわりに

Python3.6+Scrapyでスクレイピングしてみた

はじめに

参考サイト

Scrapyのインストール

Scrapyを使うための作業

1. Scrapyのプロジェクト作成

2. スクレイピング後のデータ定義

3. スクレイピングをするためのクラス（spider）を作成

4. クローニング設定を記述

5. クローニングの実行

クローニングの実行結果

クローニングの一括実行

おわりに