From f321447fea520b7a9c0b2c52a8e75acb85d66147 Mon Sep 17 00:00:00 2001
From: ayxan
Date: Thu, 8 Sep 2022 21:32:42 +0400
Subject: [PATCH] Added SqlitePostPipeline

---
Review notes (free text in this position is ignored by git am):

* The line structure of this patch was rebuilt from a copy whose newlines
  and indentation had been collapsed. Leading whitespace inside the
  settings.py and car_spider.py hunks is a best guess. The car_spider.py
  hunk was a whitespace-only change, so confirm it against the repository
  before applying.
* The added pipelines.py content was revised during review, so it no longer
  matches post-image blob e2d9714 named on its index line. Changes:
  process_item rolls back on failure instead of leaving a half-written post
  to be committed with the next item under a reused id; close_spider closes
  the connection; the next id is read with MAX(id); the column list is
  defined once.

 .gitignore                            |   3 +-
 src/post_scrape/pipelines.py          |  96 ++++++++++++++++++++++++--
 src/post_scrape/settings.py           |   8 +-
 src/post_scrape/spiders/car_spider.py |   4 +-
 4 files changed, 101 insertions(+), 10 deletions(-)

diff --git a/.gitignore b/.gitignore
index eb383b5..a90ba8a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,4 +1,5 @@
 __pycache__
 .venv
 cars.jl
-cars.csv
\ No newline at end of file
+cars.csv
+cars.db
\ No newline at end of file
diff --git a/src/post_scrape/pipelines.py b/src/post_scrape/pipelines.py
index b604660..e2d9714 100644
--- a/src/post_scrape/pipelines.py
+++ b/src/post_scrape/pipelines.py
@@ -1,6 +1,98 @@
-from itemadapter import ItemAdapter
+import sqlite3
 
 
-class PostScrapePipeline:
+class SqlitePostPipeline:
+    """Scrapy item pipeline that saves scraped car posts to ``cars.db``.
+
+    Every item becomes one row in ``cars`` plus one row per value in the
+    ``images``, ``phones`` and ``extra_fields`` tables, linked by ``post_id``.
+    """
+
+    # (column, SQL type) of every ``cars`` column after ``id``; each column
+    # is filled from the item key of the same name.
+    CAR_COLUMNS = (
+        ('url', 'TEXT'),
+        ('avto_salon', 'TEXT'),
+        ('description', 'TEXT'),
+        ('city', 'TEXT'),
+        ('brand', 'TEXT'),
+        ('model', 'TEXT'),
+        ('year', 'INTEGER'),
+        ('category', 'TEXT'),
+        ('color', 'TEXT'),
+        ('engine_volume', 'INTEGER'),
+        ('engine_power', 'INTEGER'),
+        ('fuel_type', 'TEXT'),
+        ('mileage', 'INTEGER'),
+        ('mileage_type', 'TEXT'),
+        ('transmission', 'TEXT'),
+        ('gear', 'TEXT'),
+        ('price', 'INTEGER'),
+        ('currency', 'TEXT'),
+        ('loan', 'TEXT'),
+        ('barter', 'TEXT'),
+        ('market', 'TEXT'),
+        ('seats_count', 'TEXT'),
+        ('prior_owners_count', 'TEXT'),
+        ('crashed', 'TEXT'),
+        ('painted', 'TEXT'),
+    )
+    # (table, value column, item key) of the one-to-many child tables.
+    CHILD_TABLES = (
+        ('images', 'url', 'images'),
+        ('phones', 'phone', 'phone'),
+        ('extra_fields', 'extra_field', 'extra_fields'),
+    )
+
+    def __init__(self):
+        """Open ``cars.db``, create missing tables and pick the next post id."""
+        self.con = sqlite3.connect('cars.db')
+        self.cur = self.con.cursor()
+
+        car_ddl = ', '.join(f'{name} {kind}' for name, kind in self.CAR_COLUMNS)
+        self.cur.execute(f'CREATE TABLE IF NOT EXISTS cars(id INTEGER, {car_ddl})')
+        for table, column, _ in self.CHILD_TABLES:
+            self.cur.execute(
+                f'CREATE TABLE IF NOT EXISTS {table}(post_id INTEGER, {column} TEXT)'
+            )
+
+        # Continue numbering after the highest stored id. MAX() yields NULL
+        # for an empty table, in which case numbering starts at 1.
+        self.cur.execute('SELECT MAX(id) FROM cars')
+        last_id = self.cur.fetchone()[0]
+        self.id = 1 if last_id is None else last_id + 1
+
     def process_item(self, item, spider):
+        """Store ``item`` under the next post id and return it unchanged.
+
+        All rows of one item are written in a single transaction. If any
+        field is missing or an insert fails, the transaction is rolled back
+        and the exception is re-raised, so no partial post is left behind to
+        be committed together with the next item.
+        """
+        columns = ', '.join(['id'] + [name for name, _ in self.CAR_COLUMNS])
+        marks = ', '.join('?' for _ in range(len(self.CAR_COLUMNS) + 1))
+        try:
+            row = [self.id]
+            for name, _ in self.CAR_COLUMNS:
+                value = item[name]
+                # description is joined into one space-separated string
+                row.append(' '.join(value) if name == 'description' else value)
+            self.cur.execute(
+                f'INSERT INTO cars ({columns}) VALUES ({marks})', row
+            )
+            for table, column, key in self.CHILD_TABLES:
+                self.cur.executemany(
+                    f'INSERT INTO {table} (post_id, {column}) VALUES (?, ?)',
+                    [(self.id, value) for value in item[key]],
+                )
+        except Exception:
+            self.con.rollback()
+            raise
+        self.con.commit()
+        self.id += 1
         return item
+
+    def close_spider(self, spider):
+        """Close the database connection once the spider has finished."""
+        self.con.close()
diff --git a/src/post_scrape/settings.py b/src/post_scrape/settings.py
index 71a92a5..13ad3bc 100644
--- a/src/post_scrape/settings.py
+++ b/src/post_scrape/settings.py
@@ -60,11 +60,9 @@ ROBOTSTXT_OBEY = True
 #    'scrapy.extensions.telnet.TelnetConsole': None,
 #}
 
-# Configure item pipelines
-# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
-#ITEM_PIPELINES = {
-#    'post_scrape.pipelines.PostScrapePipeline': 300,
-#}
+# ITEM_PIPELINES = {
+#     'post_scrape.pipelines.SqlitePostPipeline': 300,
+# }
 
 # Enable and configure the AutoThrottle extension (disabled by default)
 # See https://docs.scrapy.org/en/latest/topics/autothrottle.html
diff --git a/src/post_scrape/spiders/car_spider.py b/src/post_scrape/spiders/car_spider.py
index 7b1fd92..6fb2d03 100644
--- a/src/post_scrape/spiders/car_spider.py
+++ b/src/post_scrape/spiders/car_spider.py
@@ -6,8 +6,8 @@ class CarDataSpider(Spider):
     name = "turbo.az"
     allowed_domains = ('turbo.az',)
     headers = {
-            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36'
-        }
+        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36'
+    }
 
     def __init__(self, first_page=1, last_page=2, *args, **kwargs):
         super(CarDataSpider, self).__init__(*args, **kwargs)