Writing to and reading from peewee - sqlite

I am writing a program that scrapes tweets from a number of people; if the body of a tweet is unique, it gets stored in the SQLite database for that person. I have two files: one that writes to the databases and one that reads a database and searches for tweets containing a search word. Before writing to the databases I printed the tweets to the terminal, so the tweets are being pulled from Twitter correctly. But when I search for a term, every database reports zero tweets, even when no term is given. So there is a problem with either the writing or the reading of the database. Please help; I appreciate that I am very new to Python.
The writing file:
import requests
import datetime
from bs4 import BeautifulSoup
from peewee import *
from time import sleep

databases = ["femfreq.db", "boris_johnson.db", "barack_obama.db",
             "daily_mail.db", "guardian.db", "times.db", "zac_goldsmith.db",
             "bernie_sanders.db", "george_osborne.db", "john_mcdonnell.db",
             "donald_trump.db", "hillary_clinton.db", "nigel_farage.db"]

urls = ["https://twitter.com/femfreq", "https://twitter.com/BorisJohnson",
        "https://twitter.com/BarackObama",
        "https://twitter.com/MailOnline?ref_src=twsrc%5Egoogle%7Ctwcamp%5Eserp%7Ctwgr%5Eauthor",
        "https://twitter.com/guardian?ref_src=twsrc%5Egoogle%7Ctwcamp%5Eserp%7Ctwgr%5Eauthor",
        "https://twitter.com/thetimes",
        "https://twitter.com/ZacGoldsmith?ref_src=twsrc%5Egoogle%7Ctwcamp%5Eserp%7Ctwgr%5Eauthor",
        "https://twitter.com/berniesanders?lang=en-gb",
        "https://twitter.com/George_Osborne?ref_src=twsrc%5Egoogle%7Ctwcamp%5Eserp%7Ctwgr%5Eauthor",
        "https://twitter.com/johnmcdonnellMP?ref_src=twsrc%5Egoogle%7Ctwcamp%5Eserp%7Ctwgr%5Eauthor",
        "https://twitter.com/realDonaldTrump?ref_src=twsrc%5Egoogle%7Ctwcamp%5Eserp%7Ctwgr%5Eauthor",
        "https://twitter.com/HillaryClinton?ref_src=twsrc%5Egoogle%7Ctwcamp%5Eserp%7Ctwgr%5Eauthor"
        "https://twitter.com/Nigel_Farage?ref_src=twsrc%5Egoogle%7Ctwcamp%5Eserp%7Ctwgr%5Eauthor"]

selection = 0

for database_chosen in databases:
    r = requests.get(urls[selection])
    soup = BeautifulSoup(r.content, "html.parser")
    content = soup.find_all("div", {"class": "content"})

    db = SqliteDatabase(database_chosen)

    class data_input(Model):
        time_position = DateTimeField(default=datetime.datetime.now)
        header = CharField()
        time_posted = CharField()
        tweet_body = CharField(unique=True)

        class Meta:
            database = db

    db.connect()
    db.create_tables([data_input], safe=True)

    for i in content:
        try:
            data_input.create(header=i.contents[1].text,
                              time_posted=i.contents[3].text,
                              tweet_body=i.contents[5].text)
        except IntegrityError:
            pass

    for i in content:
        print("=============")
        print(i.contents[1].text)
        print(i.contents[3].text)
        print(i.contents[5].text)

    selection += 1
    print("database: {} updated".format(database_chosen))
The reading file:
from peewee import *
import datetime

databases = ["femfreq.db", "boris_johnson.db", "barack_obama.db",
             "daily_mail.db", "guardian.db", "times.db", "zac_goldsmith.db",
             "bernie_sanders.db", "george_osborne.db", "john_mcdonnell.db",
             "donald_trump.db", "hillary_clinton.db", "nigel_farage.db"]

search_results = []
search_index = 0

print("")
print("Please enter the number for the database you want to search: ")

for i in databases:
    print("{}:{}".format(i, search_index))
    search_index += 1

select = int(input("please select: "))
database_chosen = databases[select]

db = SqliteDatabase(database_chosen)

class data_input(Model):
    time_position = DateTimeField(default=datetime.datetime.now)
    header = CharField()
    time_posted = CharField()
    tweet_body = CharField(unique=True)

    class Meta:
        database = db

db.connect()

enteries = data_input.select().order_by(data_input.time_position.desc())
print(enteries)
enteries = enteries.where(data_input.tweet_body)

print("")
print("The total number of tweets in {} is: {}".format(database_chosen,
                                                       len(enteries)))
I haven't put a search function into the reading file yet; I will move on to that once I have solved this problem. Many thanks.

What are you intending to accomplish by putting ".where(data_input.tweet_body)" in the query that reads the entries? Try removing that whole line:
entries = entries.where(data_input.tweet_body)
When you go to add your search, that is when you will want to add a where clause, something like:
entries = entries.where(data_input.tweet_body.contains(search_term))
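To make that concrete, here is a minimal sketch of how the reading file's query could finish up once the count works (model and variable names follow the question; the prompt text and output format are assumptions, and this is untested):

# Hypothetical search step for the reading file, once the count works.
# Assumes the same data_input model bound to the chosen database above.
search_term = input("Enter a search term (leave blank for all tweets): ")

entries = data_input.select().order_by(data_input.time_position.desc())
if search_term:
    # contains() performs a LIKE %term% match on the tweet body
    entries = entries.where(data_input.tweet_body.contains(search_term))

print("Matching tweets in {}: {}".format(database_chosen, entries.count()))
for entry in entries:
    print("=============")
    print(entry.header)
    print(entry.time_posted)
    print(entry.tweet_body)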

Related

WebScraping for downloading certain .csv files

As the title says, I need to download certain .csv files from a website, and I'm having trouble doing it. I'm very new to programming, especially to this topic (web scraping).
from bs4 import BeautifulSoup as BS
import requests

DOMAIN = 'https://datos.gob.ar'
URL = 'https://datos.gob.ar/dataset/cultura-mapa-cultural-espacios-culturales/'
FILETYPE = ".csv"

def get_soup(url):
    return BS(requests.get(url).text, 'html.parser')

for link in get_soup(URL).find_all('a'):
    file_link = link.get('href')
    if FILETYPE in file_link:
        print(file_link)
This code shows all the available .csv files, but I only need to download the ones that end with "biblioteca popular.csv", "cine.csv" and "museos.csv". Maybe it's a very simple task, but I can't figure it out. These are the links the code prints:
https://datos.cultura.gob.ar/dataset/37305de4-3cce-4d4b-9d9a-fec3ca61d09f/resource/456d1087-87f9-4e27-9c9c-1d9734c7e51d/download/biblioteca_especializada.csv
https://datos.cultura.gob.ar/dataset/37305de4-3cce-4d4b-9d9a-fec3ca61d09f/resource/01c6c048-dbeb-44e0-8efa-6944f73715d7/download/biblioteca_popular.csv
https://datos.cultura.gob.ar/dataset/37305de4-3cce-4d4b-9d9a-fec3ca61d09f/resource/8d0b7f33-d570-4189-9961-9e907193aebc/download/casas_bicentenario.csv
https://datos.cultura.gob.ar/dataset/37305de4-3cce-4d4b-9d9a-fec3ca61d09f/resource/4207def0-2ff7-41d5-9095-d42ae8207a5d/download/museos.csv
https://datos.cultura.gob.ar/dataset/37305de4-3cce-4d4b-9d9a-fec3ca61d09f/resource/392ce1a8-ef11-4776-b280-6f1c7fae16ae/download/cine.csv
https://datos.cultura.gob.ar/dataset/37305de4-3cce-4d4b-9d9a-fec3ca61d09f/resource/87ebac9c-774c-4ef2-afa7-044c41ee4190/download/teatro.csv
You can extract the JavaScript object housing that info, which would otherwise be loaded into the place you see it by JavaScript running in the browser. You then need to do some Unicode code point cleaning and string cleaning and parse the result as JSON. You can use a keyword list to select the desired URLs.
Unicode cleaning method by @Mark Tolonen.
import json
import requests
import re

URL = 'https://datos.gob.ar/dataset/cultura-mapa-cultural-espacios-culturales/'
r = requests.get(URL)
search = ["Bibliotecas Populares", "Salas de Cine", "Museos"]

s = re.sub(r'\n\s{2,}', '', re.search(r'"#graph": (\[[\s\S]+{0}[\s\S]+)}}'.format(search[0]), r.text).group(1))
data = json.loads(re.sub(r'\\"', '', re.sub(r'\\u([0-9a-fA-F]{4})', lambda m: chr(int(m.group(1), 16)), s)))

for i in data:
    if 'schema:name' in i:
        name = i['schema:name']
        if name in search:
            print(name)
            print(i['schema:url'])
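To actually save the matched files, one possible follow-on (a sketch that assumes the schema:url values printed above are direct download links, and that data and search from the snippet are still in scope) is to stream each match to disk with requests:

# Hypothetical continuation of the loop above: download each matched file.
for i in data:
    if 'schema:name' in i and i['schema:name'] in search:
        url = i['schema:url']
        filename = url.rsplit('/', 1)[-1]  # e.g. museos.csv
        with requests.get(url, stream=True) as resp:
            resp.raise_for_status()
            with open(filename, 'wb') as f:
                for chunk in resp.iter_content(chunk_size=8192):
                    f.write(chunk)
        print("saved", filename)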

Python Pandas to sqlite

Hello, I am new and I have a question. I am trying to create a simple API using Flask.
The data I have is in CSV and I want to import it into a SQLite file, which I have done, and I can access the data.
After the data is loaded and I have confirmed it is there, I try to get Python to reflect the class, as I need to confirm it exists for Flask.
Below is what I type.
Python:
Base = automap_base()
Base.prepare(engine, reflect=True)
Base.classes.keys()
I get nothing back.
I think the reason I am getting nothing is that there is no class set up before I load the data using pandas.
Below is the code I use to load the data into SQLite:
Base = declarative_base()
engine = create_engine("sqlite:///countrytwo.sqlite")
Base.metadata.create_all(engine)

file_name = 'us.csv'
df = pd.read_csv(file_name)
df.to_sql('us', con=engine, index_label='id', if_exists='replace')

# then, to confirm there is data, I do the following:
print(engine.table_names())
So, do I need to set up the class first and then load the data into the SQLite file? Does anyone have a good website covering how to do this?
I would love a clue that leads me to the answer, but maybe not the answer itself.
If this is unclear let me know and I can post more code. Thank you.
import sqlite3
import pandas as pd
import os


class readCSVintoDB():
    def __init__(self):
        '''
        self.csvobj = csvOBJ
        self.dbobj = dbOBJ
        '''
        self.importCSVintoDB()

    def importCSVintoDB(self):
        userInput = input("enter the path of the csv file: ")
        csvfile = userInput
        df = pd.read_csv(csvfile, sep=';')
        # print("dataFrame Headers is {0}".format(df.columns))  # display the headers
        dp = df[['date', 'temperaturemin', 'temperaturemax']]
        print(dp)

        '''
        check if the DB file exists;
        if not, create an empty db file
        '''
        if not os.path.exists('./rduDB.db'):
            open('./rduDB.db', 'w').close()

        '''
        connect to the DB and get a connection cursor
        '''
        myConn = sqlite3.connect('./rduDB.db')
        dbCursor = myConn.cursor()

        '''
        Assuming I need to create a table of (Name, FamilyName, age, work)
        '''
        dbCreateTable = '''CREATE TABLE IF NOT EXISTS rduWeather
                           (id INTEGER PRIMARY KEY,
                            Date varchar(256),
                            TemperatureMin FLOAT,
                            TemperatureMax FLOAT)'''
        dbCursor.execute(dbCreateTable)
        myConn.commit()

        '''
        insert data into the database, one row at a time
        '''
        for row in dp.itertuples(index=False, name=None):
            print(row)
            # pass NULL for id so SQLite assigns the INTEGER PRIMARY KEY
            dbCursor.execute('''
                INSERT INTO rduWeather VALUES (NULL, ?, ?, ?)''', row)
        myConn.commit()

        mySelect = dbCursor.execute('''SELECT * from rduWeather WHERE (id = 10)''')
        print(list(mySelect))
        myConn.close()


test1 = readCSVintoDB()
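On the question's own hunch about needing a class before loading: SQLAlchemy's automap only generates classes for tables that have a primary key, and pandas' to_sql does not create one, which would explain the empty Base.classes.keys(). A rough sketch of that idea follows; the column names are hypothetical and it is untested against the real us.csv:

# A sketch only: declare the table (with a primary key) before loading the CSV,
# so automap reflection can pick it up. Column names here are hypothetical.
import pandas as pd
from sqlalchemy import create_engine, Column, Integer, String
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.ext.automap import automap_base

Base = declarative_base()

class US(Base):
    __tablename__ = 'us'
    id = Column(Integer, primary_key=True)  # automap skips tables without a primary key
    city = Column(String)                   # hypothetical CSV columns
    state = Column(String)

engine = create_engine("sqlite:///countrytwo.sqlite")
Base.metadata.create_all(engine)

# append into the pre-declared table instead of letting pandas replace it
df = pd.read_csv('us.csv')
df.to_sql('us', con=engine, index_label='id', if_exists='append')

AutoBase = automap_base()
AutoBase.prepare(engine, reflect=True)
print(AutoBase.classes.keys())  # 'us' should now be listed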

Revit Python Shell - Change Parameter Group

I'm trying to write a quick script to open a family document, change the parameter group of two specified parameters, and then close and save the document. I've done multiple tests and I am able to change the parameter groups of the specified parameters, but the changes to the groups don't save back to the family file. When I open the newly saved family, the parameter groups have reverted to their original group.
This is with Revit 2017.2.
The same script, when run in RPS in Revit 2018 will do as desired.
import clr
import os

clr.AddReference('RevitAPI')
clr.AddReference('RevitAPIUI')

from Autodesk.Revit.DB import *
from Autodesk.Revit.UI import UIApplication
from System.IO import Directory, SearchOption

searchstring = "*.rfa"
dir = r"C:\Users\dboghean\Desktop\vanity\2017"
docs = []

if Directory.Exists(dir):
    files = Directory.GetFiles(dir, searchstring, SearchOption.AllDirectories)
    for f in files:
        name, extension = os.path.splitext(f)
        name2, extension2 = os.path.splitext(name)
        if extension2:
            os.remove(f)
        else:
            docs.append(f)
else:
    print("Directory does not exist")

doc = __revit__.ActiveUIDocument.Document
app = __revit__.Application
uiapp = UIApplication(app)
currentPath = doc.PathName

pgGroup = BuiltInParameterGroup.PG_GRAPHICS

for i in docs:
    doc = app.OpenDocumentFile(i)
    paramList = [i for i in doc.FamilyManager.Parameters]

    t = Transaction(doc, "test")
    t.Start()

    for i in paramList:
        if i.Definition.Name in ["Right Sidesplash Edge line", "Left Sidesplash Edge line"]:
            i.Definition.ParameterGroup = pgGroup

    t.Commit()
    doc.Close(True)
Any ideas?
Thanks!
I can confirm that this happens in Revit 2017. Strange!
A simple way around it is to arbitrarily rename the parameter using doc.FamilyManager.RenameParameter, then rename it back to the original name.
So in your case this would be three additional lines of code after changing the Parameter group:
originalName = i.Definition.Name
doc.FamilyManager.RenameParameter(i, "temp")
doc.FamilyManager.RenameParameter(i, originalName)
It doesn't get to the root of the problem, but it works around it.
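For context, here is a sketch of how those three lines might sit inside the inner loop from the question (same variable names as above, inside the same transaction; untested):

# Hypothetical placement of the rename workaround inside the question's loop
for i in paramList:
    if i.Definition.Name in ["Right Sidesplash Edge line", "Left Sidesplash Edge line"]:
        i.Definition.ParameterGroup = pgGroup
        # rename away and back so Revit 2017 treats the family as modified
        originalName = i.Definition.Name
        doc.FamilyManager.RenameParameter(i, "temp")
        doc.FamilyManager.RenameParameter(i, originalName)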

Django Haystack with elasticsearch returning empty queryset while data exists

I am doing a project in Python with Django REST framework, and I am using Haystack's SearchQuerySet. My code is here:
from haystack import indexes
from Medications.models import Salt


class Salt_Index(indexes.SearchIndex, indexes.Indexable):
    text = indexes.CharField(document=True, use_template=True)
    name = indexes.CharField(model_attr='name', null=True)
    slug = indexes.CharField(model_attr='slug', null=True)
    if_i_forget = indexes.CharField(model_attr='if_i_forget', null=True)
    other_information = indexes.CharField(model_attr='other_information', null=True)
    precautions = indexes.CharField(model_attr='precautions', null=True)
    special_dietary = indexes.CharField(model_attr='special_dietary', null=True)
    brand = indexes.CharField(model_attr='brand', null=True)
    why = indexes.CharField(model_attr='why', null=True)
    storage_conditions = indexes.CharField(model_attr='storage_conditions', null=True)
    side_effects = indexes.CharField(model_attr='side_effects', null=True)

    def get_model(self):
        return Salt

    def index_queryset(self, using=None):
        return self.get_model().objects.all()
and my views.py file is -
from django.views.generic import View
from django.http import HttpResponse
from django.core import serializers
from haystack.query import SearchQuerySet


class Medication_Search_View(View):
    def get(self, request, format=None):
        try:
            get_data = SearchQuerySet().all()
            print get_data
            serialized = serializers.serialize("json", [data.object for data in get_data])
            return HttpResponse(serialized)
        except Exception, e:
            print e
My python manage.py rebuild_index works fine (it shows 'Indexing 2959 salts'), but in my views.py file SearchQuerySet() returns an empty queryset.
I am very worried about this. Please help me if you know why I am getting an empty queryset while I have data in my Salt model.
You should check the app name; it is case sensitive. Try writing the app name in lowercase letters.
My problem is solved now. The problem was that I had written the app name with capital letters while the database tables were made in lowercase (myapp_Student), so it was causing a problem on database lookup.

Can I get the actual write capacity of a DynamoDB or DynamoDB2 table?

Suppose I access an existing DynamoDB table:
import boto
conn = boto.connect_dynamodb(...)
table = conn.get_table(tableName)
or a DynamoDB2 table:
import boto
from boto.dynamodb2.layer1 import DynamoDBConnection
from boto.dynamodb2.table import Table
conn = DynamoDBConnection(...)
table = Table(tableName, connection=conn)
I want to know how much data was written to the table right before I accessed it. So I don't want the provisioned write throughput value, but the actual consumed throughput. How can I get this info?
Something like this should work:
import boto.ec2.cloudwatch
import datetime

end = datetime.datetime.utcnow()
start = end - datetime.timedelta(minutes=5)

c = boto.ec2.cloudwatch.connect_to_region('us-east-1')
data = c.get_metric_statistics(period=60, start_time=start, end_time=end,
                               metric_name='ConsumedWriteCapacityUnits',
                               namespace='AWS/DynamoDB',
                               statistics=['Sum'],
                               dimensions={'TableName': 'mytable'})
This should return a list of data points. To get an average per-second rate, add up the sums in the list and divide the total by 300, the number of seconds in the five-minute window.
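As a small follow-on sketch (assuming each returned datapoint is a dict carrying a 'Sum' key, as boto 2's CloudWatch interface returns), the per-second average could be computed like this:

# Hypothetical post-processing of the datapoints returned above.
total_units = sum(point['Sum'] for point in data)
window_seconds = 300  # five one-minute periods
if data:
    print("Average consumed write capacity: {:.3f} units/sec".format(
        total_units / float(window_seconds)))
else:
    print("No datapoints were returned for the last five minutes")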
