mirror of
https://github.com/aykhans/PostScrape.git
synced 2025-04-08 11:54:01 +00:00
Added SqlitePostPipeline
This commit is contained in:
parent
fa1eba8e49
commit
f321447fea
3
.gitignore
vendored
3
.gitignore
vendored
@ -1,4 +1,5 @@
|
||||
__pycache__
|
||||
.venv
|
||||
cars.jl
|
||||
cars.csv
|
||||
cars.csv
|
||||
cars.db
|
@ -1,6 +1,113 @@
|
||||
from itemadapter import ItemAdapter
|
||||
import sqlite3
|
||||
|
||||
|
||||
class PostScrapePipeline:
|
||||
class SqlitePostPipeline:
    """Item pipeline that writes each scraped car post into a SQLite database.

    One row per item goes into the ``cars`` table. The item's ``images``,
    ``phone`` and ``extra_fields`` sequences are stored one value per row in
    the ``images``, ``phones`` and ``extra_fields`` tables, each row carrying
    the car's id in its ``post_id`` column.

    Attributes:
        id: Id that will be assigned to the next stored item.
        con: Open ``sqlite3`` connection.
        cur: Cursor on ``con`` used for every statement.
    """

    # Item keys copied into ``cars``, in column order. The ``id`` column is
    # generated by the pipeline and always comes first.
    _CAR_FIELDS = (
        'url', 'avto_salon', 'description', 'city', 'brand', 'model', 'year',
        'category', 'color', 'engine_volume', 'engine_power', 'fuel_type',
        'mileage', 'mileage_type', 'transmission', 'gear', 'price',
        'currency', 'loan', 'barter', 'market', 'seats_count',
        'prior_owners_count', 'crashed', 'painted',
    )

    # Built only from the constant column names above; all values are bound
    # through ``?`` placeholders.
    _CAR_INSERT = "INSERT INTO cars (id, {}) VALUES (?, {})".format(
        ', '.join(_CAR_FIELDS), ', '.join('?' * len(_CAR_FIELDS)))

    # (item key, INSERT statement) for each one-to-many child table.
    _CHILD_INSERTS = (
        ('images',
         "INSERT INTO images (post_id, url) VALUES (?, ?)"),
        ('phone',
         "INSERT INTO phones (post_id, phone) VALUES (?, ?)"),
        ('extra_fields',
         "INSERT INTO extra_fields (post_id, extra_field) VALUES (?, ?)"),
    )

    _SCHEMA = (
        """
        CREATE TABLE IF NOT EXISTS cars(
            id INTEGER,
            url TEXT,
            avto_salon TEXT,
            description TEXT,
            city TEXT,
            brand TEXT,
            model TEXT,
            year INTEGER,
            category TEXT,
            color TEXT,
            engine_volume INTEGER,
            engine_power INTEGER,
            fuel_type TEXT,
            mileage INTEGER,
            mileage_type TEXT,
            transmission TEXT,
            gear TEXT,
            price INTEGER,
            currency TEXT,
            loan TEXT,
            barter TEXT,
            market TEXT,
            seats_count TEXT,
            prior_owners_count TEXT,
            crashed TEXT,
            painted TEXT
        )
        """,
        # IMAGE
        """
        CREATE TABLE IF NOT EXISTS images(
            post_id INTEGER,
            url TEXT
        )
        """,
        # PHONE
        """
        CREATE TABLE IF NOT EXISTS phones(
            post_id INTEGER,
            phone TEXT
        )
        """,
        # EXTRA_FIELDS
        """
        CREATE TABLE IF NOT EXISTS extra_fields(
            post_id INTEGER,
            extra_field TEXT
        )
        """,
    )

    def __init__(self, db_path='cars.db'):
        """Open the database, create missing tables and pick the next id.

        Args:
            db_path: Database location passed to ``sqlite3.connect``.
                Defaults to ``'cars.db'``, so the pipeline can still be
                constructed without arguments.
        """
        self.id = 1
        self.con = sqlite3.connect(db_path)
        self.cur = self.con.cursor()

        for statement in self._SCHEMA:
            self.cur.execute(statement)

        # Continue numbering after the highest id already stored. MAX()
        # yields NULL (None) for an empty table, which keeps the id at 1.
        self.cur.execute("SELECT MAX(id) FROM cars")
        last_id = self.cur.fetchone()[0]
        if last_id is not None:
            self.id = last_id + 1

    def process_item(self, item, spider):
        """Store one item and its images, phones and extra fields.

        All rows for the item are written in a single transaction: either
        every row is committed or, on any error, none is.

        Args:
            item: Mapping with every key in ``_CAR_FIELDS`` plus the
                iterables ``images``, ``phone`` and ``extra_fields``.
                ``description`` must be an iterable of strings; it is
                joined with single spaces before being stored.
            spider: The running spider (unused).

        Returns:
            The same ``item``, unchanged.

        Raises:
            KeyError: If ``item`` lacks one of the required keys.
            sqlite3.Error: If a statement fails; the transaction is rolled
                back before the error propagates.
        """
        # Collect every value before touching the database so that a missing
        # key cannot leave a half-written post behind.
        car_row = [self.id]
        for field in self._CAR_FIELDS:
            value = item[field]
            if field == 'description':
                value = ' '.join(value)
            car_row.append(value)

        child_rows = [
            (statement, [(self.id, value) for value in item[key]])
            for key, statement in self._CHILD_INSERTS
        ]

        # The connection context manager commits on success and rolls back
        # if any statement raises.
        with self.con:
            self.cur.execute(self._CAR_INSERT, car_row)
            for statement, rows in child_rows:
                self.cur.executemany(statement, rows)

        # Advance only after a successful commit so a failed item does not
        # burn or duplicate an id.
        self.id += 1
        return item

    def close_spider(self, spider):
        """Release the cursor and the database connection.

        Args:
            spider: The spider being closed (unused).
        """
        self.cur.close()
        self.con.close()
|
||||
|
@ -60,11 +60,9 @@ ROBOTSTXT_OBEY = True
|
||||
# 'scrapy.extensions.telnet.TelnetConsole': None,
|
||||
#}
|
||||
|
||||
# Configure item pipelines
|
||||
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
|
||||
#ITEM_PIPELINES = {
|
||||
# 'post_scrape.pipelines.PostScrapePipeline': 300,
|
||||
#}
|
||||
# ITEM_PIPELINES = {
|
||||
# 'post_scrape.pipelines.SqlitePostPipeline': 300,
|
||||
# }
|
||||
|
||||
# Enable and configure the AutoThrottle extension (disabled by default)
|
||||
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
|
||||
|
@ -6,8 +6,8 @@ class CarDataSpider(Spider):
|
||||
name = "turbo.az"
|
||||
allowed_domains = ('turbo.az',)
|
||||
headers = {
|
||||
'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36'
|
||||
}
|
||||
'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36'
|
||||
}
|
||||
|
||||
def __init__(self, first_page=1, last_page=2, *args, **kwargs):
|
||||
super(CarDataSpider, self).__init__(*args, **kwargs)
|
||||
|
Loading…
x
Reference in New Issue
Block a user