Scrapy exports all results in one row in SQLite - sqlite

I'm making a basic spider using Scrapy and want to store the data with SQLite. The spider is working fine and saves the data I want, but it writes all data on the same row in the database.
Here's my spider:
def parse(self, response):
for sel in response.xpath('//*[#class="class"]'):
item = ScrapedItem()
item['Name'] = sel.xpath('*/header/div//h2/a/text()').extract()
item['Site'] = sel.xpath('*/header/div/a[1]/text()').extract()
item['Category'] = sel.xpath('*/header/div/h6[3]/text()').extract()
yield item
And here is my pipeline:
import sqlite3 as lite
from xyz import settings
from xyz import items
con = None
class Pipeline(object):
def __init__(self):
self.setupDBCon()
self.createTables()
def process_item(self, item, spider):
self.storeInfoInDb(item)
return item
def storeInfoInDb(self, item):
self.cur.execute("INSERT INTO Table(\
Name, \
Site, \
Category\
) \
VALUES( ?, ?, ?, ? )", \
( \
str(item.get('Name', '')),
str(item.get('Site', '')),
str(item.get('Category', ''))
))
print item.get('Name', '')
self.con.commit()
def setupDBCon(self):
self.con = lite.connect('test.db')
self.cur = self.con.cursor()
def __del__(self):
self.closeDB()
def createTables(self):
self.dropAgencyTable()
self.createAgencyTable()
def createTable(self):
self.cur.execute("CREATE TABLE IF NOT EXISTS Table(id INTEGER PRIMARY KEY NOT NULL, \
Name TEXT, \
Site TEXT, \
Category TEXT )")
def dropTable(self):
self.cur.execute("DROP TABLE IF EXISTS Agency")
def closeDB(self):
self.con.close()
How do I save my scraped data in one separate row per scraped item?

Look at this answer by me. The problem is your spider is extracting all the items and storing it in a list and the yielding it to the items pipeline. So the item pipeline is not getting the information one at a time. The solution for this is to use a loop and iterate over all the desired rows one by one.

Related

How to test a flask app using pytest to get the coverage rate up

Currently this is my app.py files
# imports - standard imports
import json
import os
import sqlite3
# imports - third party imports
from flask import Flask, Response, jsonify, redirect
from flask import render_template as render
from flask import request, url_for
DATABASE_NAME = "inventory.sqlite"
# setting up Flask instance
app = Flask(__name__)
app.config.from_mapping(
SECRET_KEY="dev",
DATABASE=os.path.join(app.instance_path, "database", DATABASE_NAME),
)
# listing views
link = {x: x for x in ["location", "product", "movement"]}
link["index"] = "/"
def init_database():
db = sqlite3.connect(DATABASE_NAME)
cursor = db.cursor()
# initialize page content
cursor.execute(
"""
CREATE TABLE IF NOT EXISTS
products(prod_id INTEGER PRIMARY KEY AUTOINCREMENT,
prod_name TEXT UNIQUE NOT NULL,
prod_quantity INTEGER NOT NULL,
unallocated_quantity INTEGER);
"""
)
cursor.execute(
"""
CREATE TRIGGER IF NOT EXISTS default_prod_qty_to_unalloc_qty
AFTER INSERT ON products
FOR EACH ROW
WHEN NEW.unallocated_quantity IS NULL
BEGIN
UPDATE products SET unallocated_quantity = NEW.prod_quantity WHERE rowid = NEW.rowid;
END;
"""
)
# initialize page content
cursor.execute(
"""
CREATE TABLE IF NOT EXISTS location(loc_id INTEGER PRIMARY KEY AUTOINCREMENT,
loc_name TEXT UNIQUE NOT NULL);
"""
)
# initialize page content
cursor.execute(
"""
CREATE TABLE IF NOT EXISTS logistics(trans_id INTEGER PRIMARY KEY AUTOINCREMENT,
prod_id INTEGER NOT NULL,
from_loc_id INTEGER NULL,
to_loc_id INTEGER NULL,
prod_quantity INTEGER NOT NULL,
trans_time TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP,
FOREIGN KEY(prod_id) REFERENCES products(prod_id),
FOREIGN KEY(from_loc_id) REFERENCES location(loc_id),
FOREIGN KEY(to_loc_id) REFERENCES location(loc_id));
"""
)
db.commit()
#app.route("/product", methods=["POST", "GET"])
def product() -> Response | str:
init_database()
msg = None
db = sqlite3.connect(DATABASE_NAME)
cursor = db.cursor()
cursor.execute("SELECT * FROM products")
products = cursor.fetchall()
if request.method == "POST":
prod_name = request.form["prod_name"]
quantity = request.form["prod_quantity"]
transaction_allowed = False
if prod_name not in ["", " ", None] and quantity not in ["", " ", None]:
transaction_allowed = True
if transaction_allowed:
try:
cursor.execute(
"INSERT INTO products (prod_name, prod_quantity) VALUES (?, ?)",
(prod_name, quantity),
)
db.commit()
except sqlite3.Error as e:
msg = f"An error occurred: {e.args[0]}"
else:
msg = f"{prod_name} added successfully"
if msg:
print(msg)
return redirect(url_for("product"))
return render(
"product.html",
link=link,
products=products,
transaction_message=msg,
title="Products Log",
)
and this is my test function in test_product.py. I want to test my function to get my coverage on sonarcloud to be 100%. And the pytest function below seems to have no use. I must say I am very beginner to it and I am still learning.
import requests
import app
import pytest
ENDPOINT = "http://127.0.0.1:5000/product"
app.init_database()
def test_product_GET():
response = requests.get(ENDPOINT)
assert response.status_code == 200
# assert "Products Log" in response.text
#pytest.fixture()
def test_product_POST_valid():
response = requests.post(ENDPOINT, data={"prod_name": "product1", "prod_quantity": "10"}, allow_redirects=True)
assert response.status_code == 200
# assert "product1 added successfully" in response.text
#pytest.fixture()
def test_product_POST_invalid():
response = requests.post(ENDPOINT, data={"prod_name": "", "prod_quantity": ""}, allow_redirects=True)
assert response.status_code == 200
# assert "An error occurred" in response.text
I am not sure of how to make this work wihtout using request and it does have 0% coverage and I want to test this code in sonarcloud

How can I open a db.sqlite3 file and have a look at its content?

I don't know how to open a db.sqlite3 file in reader-friendly way.
I hope the data in it would be shown in tables
Upload your file here and get the tabulated result:
http://inloop.github.io/sqlite-viewer/
OR run a Python script like below
def create_connection(db_file):
""" create a database connection to the SQLite database specified by the db_file :param db_file: database file :return: Connection object or None """ conn = None
try:
conn = sqlite3.connect(db_file) except Error as e:
print(e) return conn
def select_all_tasks(conn):
""" Query all rows in the tasks table :param conn: the Connection object :return: """
cur = conn.cursor()
cur.execute("SELECT * FROM tasks") rows = cur.fetchall() for row in rows: print(row)

Why is SQLAlchemy expecting 6 rows here when I want to delete two?

I am using Flask-SQLAlchemy and I have the following tables:
tags = db.Table('tags',
db.Column('tag_id', db.Integer, db.ForeignKey('tag.id'), primary_key=True),
db.Column('post_id', db.Integer, db.ForeignKey('post.id'), primary_key=True))
class Post(db.Model):
id = db.Column(db.Integer, primary_key=True)
title = db.Column(db.String(250))
body = db.Column(db.Text)
timestamp = db.Column(db.DateTime, index=True, default=datetime.utcnow)
author = db.Column(db.String(64))
tags = db.relationship('Tag', secondary=tags, lazy='subquery',
backref=db.backref('posts', lazy=True))
published = db.Column(db.Boolean)
slug = db.Column(db.String(300), unique=True, index=True)
def save(self):
if not self.slug:
self.slug = re.sub('[^\w]+', '-', self.title.lower())
def update_time(self):
self.timestamp = datetime.utcnow()
def __repr__(self):
return '<Post {}>'.format(self.title)
class Tag(db.Model):
id = db.Column(db.Integer, primary_key=True)
tag = db.Column(db.String(64), index=True, unique=True)
def __repr__(self):
return '<Tag {}>'.format(self.tag)
Currently I am making a function to allow the user to delete a post entirely with db.session.delete(post) and I am getting the follwing error:
sqlalchemy.orm.exc.StaleDataError: DELETE statement on table 'tags' expected to delete 6 row(s); Only 2 were matched.
Why does the delete statement expect 6 rows? 2 is the correct number and corresponds to how many entries there are in the tags table for that post, two unique tags attached to this post.
It seems the issue here was with my route and not my models.
Bringing the delete function code to the beginning of the route gave me the expected behaviour; the post was deleted from the database and associations in the helper table also removed.
The error was caused by some code in my route that appended selected tags from the submitted form to post.tags multiple times over resulting in the expectation of 6 rows.

Adding UniqueKey constraint to a sqlite3 table with Flask-Migration fails with IntrgrityError

So I using sqlite as my test database and have the following classes in my models.py
class User(UserMixin, db.Model):
__tablename__ = 'users'
id = db.Column(db.Integer, primary_key=True, index=True)
username = db.Column(db.String(40), unique=True, index=True)
password_hash = db.Column(db.String(256))
alternate_id = db.Column(db.String(100))
posts = db.relationship('Posts', backref='author', lazy=True)
def get_id(self):
return str(self.alternate_id)
def __init__(self, username, password):
self.username = username
self.password_hash = generate_password_hash(password)
self.alternate_id = my_serializer.dumps(
self.username + self.password_hash)
def verify_password(self, password):
if check_password_hash(self.password_hash, password):
return "True"
class Posts(db.Model):
id = db.Column(db.Integer, primary_key=True)
title = db.Column(db.String(100), nullable=False, unique=True)
description = db.Column(db.String(1500))
author_id = db.Column(db.Integer, db.ForeignKey('users.id'))
def __init__(self, title, description, author_id):
self.title = title
self.description = description
self.author_id = author_id
I added the unique key constraint to column title in my Posts class and then was trying to update the schema using Flask-Migrate.
Initially I was getting the No support for ALTER of constraints in SQLite dialect errors since sqlite3 does not support it through alembic. So I looked the alembic documentation and found that you can actually do such migrations using the batch mode migrations. So I updated my migration script as below.
def upgrade():
# ### commands auto generated by Alembic - please adjust! ###
with op.batch_alter_table("posts") as batch_op:
batch_op.create_unique_constraint('unique_title', ['title'])
# ### end Alembic commands ###
Now when I try to run flask db upgrade I get the following error
sqlalchemy.exc.IntegrityError: (sqlite3.IntegrityError) UNIQUE constraint failed: _alembic_tmp_posts.title [SQL: 'INSERT INTO
_alembic_tmp_posts (id, title, description, author_id) SELECT posts.id, posts.title, posts.description, posts.author_id \nFROM posts'] (Background on this error at: http://sqlalche.me/e/gkpj`)
I am not able to understand that why IntegrityError exception is being thrown because if I look at the insert statement the number of columns are same.
Does it have something to do with the authors_id column having a foreignkey constraint on it ?
The database table column on which I was adding the unique constraint had duplicate data and that was the reason I was getting the integrity error, I am just surprised why I didn't notice that earlier.
So once I removed one of the duplicate rows, the database upgrade was successful.

How to use sqlalchemy to select data from a database?

I have two sqlalchemy scripts, one that creates a database and a few tables and another that selects data from them.
create_database.py
from sqlalchemy import create_engine, Table, Column, Integer, String, MetaData, ForeignKey, select
engine = create_engine('sqlite:///test.db', echo=True)
metadata = MetaData()
addresses = Table ('addresses', metadata,
Column('id', Integer, primary_key=True),
Column('user_id', None, ForeignKey('users.id')),
Column('email_addresses', String, nullable=False)
)
users = Table ('users', metadata,
Column('id', Integer, primary_key=True),
Column('name', String),
Column('fullname', String),
)
metadata.create_all(engine)
select.py
from sqlalchemy import create_engine, select
engine = create_engine('sqlite:///test.db', echo=True)
conn = engine.connect()
s = select([users])
result = conn.execute(s)
I am able to run the create_database.py script but when I run the select.py script I get the following error
$ python select.py
Traceback (most recent call last):
File "select.py", line 5, in <module>
s = select([users])
I am able to run the select statement from within the create_database.py by appending the following to create_database.py
conn = engine.connect()
s = select([users])
result = conn.execute(s)
How can I run the select statements from a separate script than create_database.py
The script select.py does not see users and addresses defined in create_database.py. Import them in select.py before using them.
In select.py:
from create_database import users, addresses
## Do something with users and addresses

Resources