Wrong page parsed BeautifulSoup? - web-scraping

I want to enter two values on the website https://hausratversicherung.friday.de/ and retrieve the resulting value after submitting the form. I wrote the following code:
import requests, re
from robobrowser import RoboBrowser
# Open the quote page, fill in the two form inputs and submit the form.
br = RoboBrowser(parser='html.parser')
br.open("https://hausratversicherung.friday.de/")
form = br.get_form()
form['area'] = 100
form['postalCode'] = 44326
br.submit_form(form)

# Stringify the parsed response so the price can be cut out with a regex.
src = str(br.parsed())
start = '<div class="Typography-sc-3c3fuf-0 jEIicc" data-testid="totalPrice">'
end = ' €</div>'

# `re.search` needs a dot: the original `re,search(...)` built a tuple and
# failed with NameError before any matching happened.
match = re.search('%s(.*)%s' % (start, end), src)
if match is None:
    # Fail with a clear message instead of AttributeError on `None.group`.
    raise ValueError('totalPrice element not found in the response')
result = match.group(1)
print(result)
But the browser `br` does not open the mentioned page or pick up these values.

The postal code 44326 isn't accepted by the server. For other postal codes you can query their API directly:
import json
import requests
# Inputs for the quote.
area = 100
postalcode = 44309

# Pricing endpoint; both inputs are substituted into the query string.
url = 'https://fdy2-policycenter-production.k8s.blue.friday-prod.de/rest/friday/hc/price?area={area}&postalCode={postalcode}'
response = requests.get(url.format(area=area, postalcode=postalcode))
data = response.json()

# Uncomment to dump the whole payload:
# print(json.dumps(data, indent=4))

# Show the insured sum and the price of the first basic coverage.
first_coverage = data['basicCoverages']['coverages'][0]
print(first_coverage['insuredSum']['amount'])
print(first_coverage['price']['amount'])
Prints:
65000.0
7.81

Related

My result contains only one row of data. Can you help me get more data and save it to another file?

import requests
from bs4 import BeautifulSoup
import pandas as pd
# Base address of the listing; the page number is appended per request.
URL = 'http://h1.nobbd.de/index.php?start='

for page in range(1,10):
    response = requests.get(URL + str(page) + '=')
    soup = BeautifulSoup(response.text, 'html.parser')
    for wrapper in soup.find_all('div',attrs={'class','report-wrapper'}):
        for entry in wrapper.find_all("div",attrs={"class","report"}):
            # Field positions inside a report entry: first <b>, then the
            # third and second <a> elements.
            layanan = entry.find_all('b')[0].text.strip()
            report = entry.find_all('a')[2].text.strip()
            bug_hunter = entry.find_all('a')[1].text.strip()
            mirror = entry.find("a", {"class": "title"})['href']
            # The text of the last "date" div wins.
            for stamp in entry.find_all("div", {"class": "date"}):
                waktu = stamp.text
            data = {
                "Company": [layanan],
                "Title:": [report],
                "Submit:": [bug_hunter],
                "Link:": [mirror],
                "Date:": [waktu],
            }
            # NOTE: `df` is rebuilt from a single entry on every pass, so
            # only the most recently processed report is kept.
            df = pd.DataFrame(data)
My result contains only one row of data. Can you help me get more data and save it to another file?
df.head()
index
Company
Title:
Submit:
Link:
Date:
0
Reddit
Application level DOS at Login Page ( Accepts Long Password )
e100_speaks
https://hackerone.com/reports/1168804
03 Feb 2022
What happens?
Based on the code in your question, you overwrite your dataframe on every iteration; that's why you only get one result.
How to fix?
Create an empty list before your loops
Append all the extracted dicts to this list
Create your dataframe based on that list of dicts
Example
import requests
from bs4 import BeautifulSoup
import pandas as pd
# One dict per report is collected here; the DataFrame is built once, after
# all pages have been processed, so nothing gets overwritten.
data = []
url = 'http://h1.nobbd.de/index.php?start='

for page in range(1, 3):
    req = requests.get(url + str(page))
    soup = BeautifulSoup(req.text, 'html.parser')
    # `attrs` takes a dict; the original passed a set literal by mistake.
    for hack in soup.find_all('div', attrs={'class': 'report-wrapper'}):
        for i in hack.find_all('div', attrs={'class': 'report'}):
            links = i.find_all('a')
            layanan = i.find_all('b')[0].text.strip()
            report = links[2].text.strip()
            bug_hunter = links[1].text.strip()
            mirror = i.find('a', {'class': 'title'})['href']
            # Use the last "date" div of this report; None when there is
            # none, instead of silently reusing the previous report's date.
            dates = i.find_all('div', {'class': 'date'})
            waktu = dates[-1].text if dates else None
            # Plain scalars (not one-element lists) so each cell holds a
            # value; `bug_hunter` replaces the undefined name `hunter`.
            data.append({
                'Company': layanan,
                'Title': report,
                'Submit': bug_hunter,
                'link': mirror,
                'Date': waktu,
            })

df = pd.DataFrame(data)

Instaloader data scraping using specific hashtag and timeframe

I need help using instaloader to data scrape posts from Instagram that include #slowfashion from a specific timeframe.
I want to scrape the visual and textual data from the posts (specifically, the image/s posted, their descriptions, and comments).
from datetime import datetime
from itertools import dropwhile, takewhile
import instaloader
# Choose which artefacts are stored for every downloaded post.
L = instaloader.Instaloader(
    download_pictures=True,
    download_videos=False,
    download_comments=False,
    save_metadata=True,
)

# Authenticate; the password is requested interactively.
username = input("Enter your username: ")
L.interactive_login(username=username)

# Ask which hashtag to fetch and how many posts at most.
search = input("Enter Hashtag: ")
limit = int(input("How many posts to download: "))

# Iterator over the posts published under the hashtag.
hashtags = instaloader.Hashtag.from_name(L.context, search).get_posts()

# Date window: posts dated after SINCE are skipped at the front, then posts
# are taken for as long as they are dated after UNTIL.
# NOTE(review): this presumes the iterator yields newest posts first - confirm.
SINCE = datetime(2021, 5, 1)
UNTIL = datetime(2021, 3, 1)
not_after_since = dropwhile(lambda p: p.date > SINCE, hashtags)
within_window = takewhile(lambda p: p.date > UNTIL, not_after_since)

no_of_downloads = 0
for post in within_window:
    # Stop as soon as the requested number of posts has been stored.
    if no_of_downloads == limit:
        break
    print(post.date)
    L.download_post(post, "#"+search)
    no_of_downloads += 1

How to import data from a HTML table on a website to excel?

I would like to do some statistical analysis with Python on the live casino game called Crazy Time from Evolution Gaming. There is a website that has the data to do this: https://tracksino.com/crazytime. I want the data of the lowest table, 'Spin History', to be imported into Excel. However, I do not know how this can be done. Could anyone give me an idea of where to start?
Thanks in advance!
Try the below code:
import json
import requests
from urllib3.exceptions import InsecureRequestWarning
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
import csv
import datetime
def scrap_history():
    """Append the Crazy Time spin history to a CSV file.

    Pages of 100 records (``per_page=100``, ``period=24hours``) are requested
    from the Tracksino API, starting at page 1, until a page comes back with
    no data. Every record becomes one row in ``file_path + file_name``; the
    header row is written before the rows of the first page.

    Raises:
        requests.HTTPError: if the API answers with an error status.
    """
    csv_headers = ['Occured At', 'Slot Result', 'Spin Result', 'Total Winners', 'Total Payout']
    file_path = ''  # directory prefix where the file should be saved
    file_name = 'spin_history.csv'  # filename
    page_number = 1

    # Open the file once for the whole run instead of once per page.
    # No newline argument: text mode keeps translating the '\n' terminator
    # exactly as the per-page version did.
    with open(file_path + file_name, 'a+') as history:
        csvwriter = csv.DictWriter(history, delimiter=',', lineterminator='\n', fieldnames=csv_headers)
        while True:
            # Dynamic URL fetching data in chunks of 100
            url = 'https://api.tracksino.com/crazytime_history?filter=&sort_by=&sort_desc=false&page_num=' + str(page_number) + '&per_page=100&period=24hours'
            print('-' * 100)
            print('URL created : ', url)
            # NOTE(review): verify=False switches off TLS certificate
            # verification; kept from the original, but confirm it is needed.
            # The timeout keeps a stalled connection from hanging forever.
            response = requests.get(url, verify=False, timeout=30)
            response.raise_for_status()
            history_data = response.json()['data']
            print(history_data)

            # An empty (or missing) page means everything has been read.
            if not history_data:
                break

            if page_number == 1:
                print('Writing CSV header now...')
                csvwriter.writeheader()

            # Write extracted data into the CSV file one record at a time.
            for item in history_data:
                value = datetime.datetime.fromtimestamp(item['when'])
                occured_at = f'{value:%d-%B-%Y # %H:%M:%S}'
                csvwriter.writerow({
                    'Occured At': occured_at,
                    'Slot Result': item['slot_result'],
                    'Spin Result': item['result'],
                    'Total Winners': item['total_winners'],
                    'Total Payout': item['total_payout'],
                })

            print('-' * 100)
            page_number += 1
            print(page_number)
            print('-' * 100)
Explanation:
I implemented the above script with the Python requests library. The API URL https://api.tracksino.com/crazytime_history?filter=&sort_by=&sort_desc=false&page_num=1&per_page=50&period=24hours was extracted from the website itself (refer to the screenshot; the script requests per_page=100). In each step the script builds the URL dynamically: the page number changes on every iteration. For example, first page_num = 1, then page_num = 2, and so on until all the data has been extracted.

I'm Getting error json.decoder.JSONDecodeError: while running a python code

I got this code from the internet for extracting data from the Justdial website.
While running this code I got the following error:
ERROR: json.decoder.JSONDecodeError: Expecting value: line 1 column 1 (char 0)
Please help me run this code, as I'm not familiar with Python. What changes should be made, step by step, to get it running?
Thank you in advance.
Here is my code:
import csv
import json
import requests
from bs4 import BeautifulSoup
print(25*"=")
print("Just Dial Scraper")
print(25*"=")

# Search endpoint; city, query and page number are filled in per request.
url = ('http://www.justdial.com/functions/ajxsearch.php?national_search=0&act'
       '=pagination&city={0}&search={1}&page={2}')

what = input("Enter your Query: ")
what = what.replace(' ', '+')
where = input("Enter the Location: ")

with open(what+"_"+where+'.csv', 'w') as f:
    f.write('company, address, phone\n')
    # csv.writer quotes every field like the hand-rolled version did, but
    # also escapes embedded quotes and writes a missing title as "".
    writer = csv.writer(f, quoting=csv.QUOTE_ALL, lineterminator='\n')
    page = 1
    while True:
        print('Scraping Page', page)
        resp = requests.get(url.format(where, what, page))
        # resp.json() raises json.decoder.JSONDecodeError ("Expecting value:
        # line 1 column 1 (char 0)") when the response body is not JSON.
        # Decode once and reuse the result.
        payload = resp.json()
        if not payload['paidDocIds']:
            print(25*"-")
            print('Scraping Finished')
            print(25*"-")
            break

        # '\\/' is a backslash followed by a slash (the original '\/' is an
        # invalid escape sequence with the same value).
        markup = payload['markup'].replace('\\/', '/')
        soup = BeautifulSoup(markup, 'html.parser')

        for thing in soup.find_all('section'):
            if thing.get('class') != ['jcar']:
                continue
            csv_list = []

            # Company name
            for a_tag in thing.find_all('a'):
                if a_tag.get('onclick') == "_ct('clntnm', 'lspg');":
                    csv_list.append(a_tag.get('title'))

            # Address
            for span_tag in thing.find_all('span'):
                if span_tag.get('class') == ['mrehover', 'dn']:
                    csv_list.append(span_tag.get_text().strip())

            # Phone number; anchors without an href are skipped instead of
            # crashing on None.startswith.
            for a_tag in thing.find_all('a'):
                href = a_tag.get('href') or ''
                if href.startswith('tel:'):
                    csv_list.append(href.split(':')[-1])

            writer.writerow(csv_list)

        page += 1

Unable to get data into correct format after web scraping in python

I have written code that scrapes the website "https://www.newegg.com/Product/ProductList.aspx?Submit=ENE&N=-1&IsNodeId=1&Description=GTX&bop=And&Page={}&PageSize=36&order=BESTMATCH".format(page),
but when I run this code the data is not formatted correctly: the product name ends up in every cell, and the same happens with the price and the image.
from urllib.request import urlopen
from bs4 import BeautifulSoup
# Output file with a fixed header line.
f = open("Scrapedetails.csv", "w")
Headers = "Item_Name, Price, Image\n"
f.write(Headers)

# Listing URL; only the page number changes between requests.
url_template = (
    "https://www.newegg.com/Product/ProductList.aspx?"
    "Submit=ENE&N=-1&IsNodeId=1&Description=GTX&bop=And&Page="
    "{}&PageSize=36&order=BESTMATCH"
)

for page in range(1,15):
    page_url = url_template.format(page)
    html = urlopen(page_url)
    bs0bj = BeautifulSoup(html, "html.parser")
    for container in bs0bj.find_all("div", {"class":"item-container"}):
        title_tag = container.find("a", {"class":"item-title"})
        price_tag = container.find("li", {"class":"price-current"})
        image_tag = container.find("img")

        name_text = title_tag.get_text()
        price_text = price_tag.get_text()
        image_src = image_tag["src"]  # value of the src attribute

        # Fields are joined with bare commas and no quoting; each of the
        # first two pieces is stripped separately, as before.
        name_part = "{}".format(name_text).strip()
        price_part = ",{}".format(price_text).strip()
        image_part = ",{}".format(image_src)
        f.write(name_part + price_part + image_part + "\n")

f.close()
Can someone help me amend the code so that I get the name in the name column, the price in the price column and the image in the image column?
What are the newer ways to save data to CSV? Can someone help me with code for that too?
Alright i got it solved.
from urllib.request import urlopen
from bs4 import BeautifulSoup
import csv

Headers = "Item_Name, Price, Image\n"

# Listing URL; only the page number changes between requests.
url_template = (
    "https://www.newegg.com/Product/ProductList.aspx?"
    "Submit=ENE&N=-1&IsNodeId=1&Description=GTX&bop=And&Page="
    "{}&PageSize=36&order=BESTMATCH"
)

# `with` closes the file even when a request or a lookup fails midway.
with open("Scrapedetails.csv", "w") as f:
    f.write(Headers)
    # csv.writer quotes fields that contain commas, so product names no
    # longer need their commas replaced by "|" to stay in one column.
    writer = csv.writer(f, lineterminator="\n")

    for page in range(1, 15):
        page_url = url_template.format(page)
        html = urlopen(page_url)
        bs0bj = BeautifulSoup(html, "html.parser")
        page_details = bs0bj.find_all("div", {"class": "item-container"})

        for i in page_details:
            Item_Name = i.find("a", {"class": "item-title"})
            price_li = i.find("li", {"class": "price-current"})
            # Either the <li> or its <strong> child may be absent.
            Price = price_li.find("strong") if price_li is not None else None
            Image = i.find("img")

            # A missing element yields an empty cell instead of a crash.
            Name_item = Item_Name.get_text().strip() if Item_Name is not None else ""
            prin = Price.get_text() if Price is not None else ""
            imgf = Image.get("src") if Image is not None else None
            image_url = "https:{}".format(imgf) if imgf else ""

            print(Name_item)
            print(prin)
            print(image_url)
            writer.writerow([Name_item, prin, image_url])
This is the code, for anyone who wishes to start with web scraping in the simplest way.

Resources