Instaloader data scraping using specific hashtag and timeframe - web-scraping

I need help using instaloader to scrape Instagram posts that include #slowfashion from a specific timeframe.
I want to scrape the visual and textual data from the posts (specifically, the image(s) posted, their descriptions, and comments).

from datetime import datetime
from itertools import dropwhile, takewhile
import instaloader
# Use parameters to save different metadata
L = instaloader.Instaloader(download_pictures=True, download_videos=False,
                            download_comments=False, save_metadata=True)
# Login
username = input("Enter your username: ")
L.interactive_login(username=username)
# User query
search = input("Enter Hashtag: ")
limit = int(input("How many posts to download: "))
# Hashtag object
hashtags = instaloader.Hashtag.from_name(L.context, search).get_posts()
# Download period: posts are iterated newest-first, so SINCE is the newer bound
# and UNTIL the older one
SINCE = datetime(2021, 5, 1)
UNTIL = datetime(2021, 3, 1)
no_of_downloads = 0
for post in takewhile(lambda p: p.date > UNTIL, dropwhile(lambda p: p.date > SINCE, hashtags)):
    if no_of_downloads == limit:
        break
    print(post.date)
    L.download_post(post, "#" + search)
    no_of_downloads += 1
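Since the goal also includes the captions and comments, here is a minimal sketch of a helper that could be called next to L.download_post() inside the loop above. Post.caption and Post.get_comments() are documented parts of instaloader's Post API; collect_text_data is a hypothetical helper name, and fetching comments can be slow and may require a logged-in session:

def collect_text_data(post):
    """Gather the textual data of one post: its caption and the comment texts."""
    caption = post.caption or ""                                  # the post's description
    comments = [comment.text for comment in post.get_comments()]  # may be slow for popular posts
    return {"shortcode": post.shortcode, "caption": caption, "comments": comments}

Alternatively, passing download_comments=True when constructing Instaloader makes it save each post's comments to a JSON file next to the downloaded media, and save_metadata=True already stores the caption in the post's metadata file.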

Related

Web Scraping: How do I return specific user input forms in python?

I'm having trouble getting the search to return an exact match for the user input.
Emphasoft developer challenge:
Taking a list of tax form names (ex: "Form W-2", "Form 1095-C"),
search the website and return some informational results.
Specifically, you must return the "Product Number", the "Title", and
the maximum and minimum years the form is available for download.
Taking a tax form name (ex: "Form W-2") and a range of years
(inclusive, 2018-2020 should fetch three years), download all PDFs
available within that range.
import json
import os
import sys

import requests
from bs4 import BeautifulSoup

URL = 'https://apps.irs.gov/app/picklist/list/priorFormPublication.html?resultsPerPage=200&sortColumn=sortOrder&indexOfFirstRow=0&{param.strip}&isDescending=false'


def get_forms(list_tax_form: list):
    """
    function to get the response from irs.gov with all forms content
    :param list_tax_form: list of form names that we want to get info about
    :return: dict with form name, form title
    """
    response_list = []  # list for all responses of form names
    with requests.session() as session:
        for param in list_tax_form:
            request_params = {'value': param,
                              'criteria': 'formNumber',
                              'submitSearch': 'Find',
                              }
            res = session.get(URL, params=request_params).content
            response_list.append(res)
    return response_list


def parse_responses(list_tax_form: list):
    """
    function to get all form names, titles and years from the previous function's return
    :param list_tax_form: list of form names that we want to get info about
    :return: list of form names, titles, years
    """
    responses = get_forms(list_tax_form)
    # empty lists to fill with the received information for all names, years, and titles
    td_form_name, td_form_title, td_form_rev_year = [], [], []
    for response in responses:
        soup = BeautifulSoup(response, 'lxml')
        td_name = soup.find_all('td', {'class': 'LeftCellSpacer'})
        td_title = soup.find_all('td', {'class': 'MiddleCellSpacer'})
        td_rev_year = soup.find_all('td', {'class': 'EndCellSpacer'})
        td_form_name.extend(td_name)
        td_form_title.extend(td_title)
        td_form_rev_year.extend(td_rev_year)
    return td_form_name, td_form_title, td_form_rev_year


def format_responses(list_tax_form: list):
    """
    function to format all responses for all forms we got
    Task 1
    :param list_tax_form: list of form names that we want to get info about
    :return: formatted names, links, years
    """
    td_names, td_titles, td_years = parse_responses(list_tax_form)
    names = [name.text.strip() for name in td_names]
    links = [link.find('a')['href'] for link in td_names]
    titles = [title.text.strip() for title in td_titles]
    years = [int(year.text.strip()) for year in td_years]
    set_names = set(names)
    final_dict = []
    # loop to create a dictionary of result information with the years each tax form is available for download
    for name in set_names:
        max_year = 0
        min_year = max(years)
        dict1 = {'form_number': name}
        for index, p_name in enumerate(names):
            if p_name == name:
                if years[index] > max_year:
                    max_year = years[index]
                elif years[index] < min_year:
                    min_year = years[index]
                dict1['form_title'] = titles[index]
        dict1['max_year'] = max_year
        dict1['min_year'] = min_year
        final_dict.append(dict1)
    print(json.dumps(final_dict, indent=2))
    return names, links, years


def download_files(list_tax_form):
    """
    Task 2
    Download the pdf files of the form_name entered by the user.
    :param list_tax_form: list of form names that we want to get info about
    :return: message to the user on whether the files were created successfully
    """
    names, links, years = format_responses(list_tax_form)
    form_name = input('enter form name: ')
    if form_name in names:
        print('form exists. enter years range')
        form_year1 = int(input('start year to analysis: '))
        form_year2 = int(input('end year to analysis: '))
        try:
            os.mkdir(form_name)
        except FileExistsError:
            pass
        # indices defining the range of this form_name in the list of all tax form names
        r_index = names.index(form_name)  # index of first form_name mention in the list
        l_index = names.index(form_name)  # index of last form_name mention in the list
        for name in names:
            if name == form_name:
                r_index += 1
        years = years[l_index:r_index]
        if form_year1 < form_year2:
            range_years = range(form_year1, form_year2 + 1)
            for year in range_years:
                if year in years:
                    link = links[l_index + years.index(year)]  # index back into the full links list
                    form_file = requests.get(link, allow_redirects=True)
                    open(f'{form_name}/{form_name}_{str(year)}.pdf', 'wb').write(form_file.content)
            print(f'files saved to {form_name}/ directory!')
    else:
        print('input correct form name!')


if __name__ == '__main__':
    tax_list = sys.argv[1:]  # form names
    download_files(tax_list)
(ex: "Form W-2" should not return "Form W-2 P")
When this file is ran, it is displaying other unrelated results.
How can I resolve this issue to display only specified user requests?
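One way to approach this is to keep only the rows whose product number equals the requested name exactly, instead of every row the search returns. A minimal sketch, reusing the parallel names/titles/years/links lists produced in format_responses above (exact_match_rows is a hypothetical helper):

def exact_match_rows(names, titles, years, links, form_name):
    """Keep only the rows whose product number is exactly form_name,
    so "Form W-2" no longer also matches "Form W-2 P"."""
    return [(n, t, y, l)
            for n, t, y, l in zip(names, titles, years, links)
            if n == form_name]

Applying the same exact comparison when building final_dict in format_responses would also keep the printed summary limited to the forms that were actually asked for.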

Wrong page parsed BeautifulSoup?

I want to enter two values on the website https://hausratversicherung.friday.de/ and retrieve the resulting value after submitting them. I wrote the following code:
import requests, re
from robobrowser import RoboBrowser
br = RoboBrowser(parser='html.parser')
br.open("https://hausratversicherung.friday.de/")
form = br.get_form()
form['area'] = 100
form['postalCode'] = 44326
br.submit_form(form)
src = str(br.parsed())
start = '<div class="Typography-sc-3c3fuf-0 jEIicc" data-testid="totalPrice">'
end = ' €</div>'
result = re.search('%s(.*)%s' % (start, end), src).group(1)
print(result)
But the browser br is not opening the mentioned page or picking up these values.
The postal code 44326 isn't accepted by the server. For other postal codes you can query their API directly:
import json
import requests
area = 100
postalcode = 44309
url = 'https://fdy2-policycenter-production.k8s.blue.friday-prod.de/rest/friday/hc/price?area={area}&postalCode={postalcode}'
data = requests.get(url.format(area=area, postalcode=postalcode)).json()
# uncomment this to print all data:
# print(json.dumps(data, indent=4))
# print some info to screen:
print(data['basicCoverages']['coverages'][0]['insuredSum']['amount'])
print(data['basicCoverages']['coverages'][0]['price']['amount'])
Prints:
65000.0
7.81

How to import data from a HTML table on a website to excel?

I would like to do some statistical analysis with Python on the live casino game Crazy Time from Evolution Gaming. There is a website that has the data for this: https://tracksino.com/crazytime. I want the data from the bottom table, 'Spin History', to be imported into Excel. However, I do not know how this can be done. Could anyone give me an idea where to start?
Thanks in advance!
Try the below code:
import json
import requests
from urllib3.exceptions import InsecureRequestWarning
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
import csv
import datetime
def scrap_history():
    csv_headers = []
    file_path = ''  # directory on your system where the file should be saved
    file_name = 'spin_history.csv'  # filename
    page_number = 1
    while True:
        # Dynamic URL fetching data in chunks of 100
        url = 'https://api.tracksino.com/crazytime_history?filter=&sort_by=&sort_desc=false&page_num=' + str(page_number) + '&per_page=100&period=24hours'
        print('-' * 100)
        print('URL created : ', url)
        response = requests.get(url, verify=False)
        result = json.loads(response.text)  # loading data to convert into JSON
        history_data = result['data']
        print(history_data)
        if history_data != []:
            with open(file_path + file_name, 'a+') as history:
                # Headers for the file
                csv_headers = ['Occurred At', 'Slot Result', 'Spin Result', 'Total Winners', 'Total Payout']
                csvwriter = csv.DictWriter(history, delimiter=',', lineterminator='\n', fieldnames=csv_headers)
                if page_number == 1:
                    print('Writing CSV header now...')
                    csvwriter.writeheader()
                # write the extracted data into the csv file row by row
                for item in history_data:
                    value = datetime.datetime.fromtimestamp(item['when'])
                    occurred_at = f'{value:%d-%B-%Y # %H:%M:%S}'
                    csvwriter.writerow({'Occurred At': occurred_at,
                                        'Slot Result': item['slot_result'],
                                        'Spin Result': item['result'],
                                        'Total Winners': item['total_winners'],
                                        'Total Payout': item['total_payout'],
                                        })
            print('-' * 100)
            page_number += 1
            print(page_number)
            print('-' * 100)
        else:
            break


scrap_history()
Explanation:
I implemented the above script using the requests library. The API URL https://api.tracksino.com/crazytime_history?filter=&sort_by=&sort_desc=false&page_num=1&per_page=50&period=24hours was extracted from the website itself. The script builds the URL dynamically, with a page number that changes on every iteration: first page_num = 1, then page_num = 2, and so on, until all the data has been extracted.
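Since the original question asks for the data in Excel rather than CSV, the file written above can be converted afterwards with pandas. A small sketch, assuming pandas and an Excel writer engine such as openpyxl are installed and that spin_history.csv was produced by the script above:

import pandas as pd

# Read the CSV produced by scrap_history() and save it as an Excel workbook.
df = pd.read_csv('spin_history.csv')
df.to_excel('spin_history.xlsx', index=False)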

Why can't I access information in tbody?

I am doing web scraping with BeautifulSoup but cannot find the tr elements inside tbody; the tr elements are clearly there in the page source (screenshot of the source: https://i.stack.imgur.com/NFwEV.png), yet find_all only returns the tr inside thead.
The link I am scraping: https://cpj.org/data/killed/?status=Killed&motiveConfirmed%5B%5D=Confirmed&type%5B%5D=Journalist&start_year=1992&end_year=2019&group_by=year
Here is some of my code:
from urllib.request import urlopen
from bs4 import BeautifulSoup

url = "https://cpj.org/data/killed/?status=Killed&motiveConfirmed%5B%5D=Confirmed&type%5B%5D=Journalist&start_year=1992&end_year=2019&group_by=year"
html = urlopen(url)
soup = BeautifulSoup(html, 'lxml')
type(soup)
tr = soup.find_all("tr")
print(tr)
The data is requested via an API that returns JSON, i.e. it is added to the page dynamically, so it doesn't appear in the response to your request for the landing page. You can find the API endpoint used to get the info in the browser's network tab.
You can alter one of the parameters to a number larger than the expected result set, then check whether you need to make further requests.
import requests
r = requests.get('https://cpj.org/api/datamanager/reports/entries?distinct(personId)&includes=organizations,fullName,location,status,typeOfDeath,charges,startDisplay,mtpage,country,type,motiveConfirmed&sort=fullName&pageNum=1&pageSize=2000&in(status,%27Killed%27)&or(eq(type,%22media%20worker%22),in(motiveConfirmed,%27Confirmed%27))&in(type,%27Journalist%27)&ge(year,1992)&le(year,2019)').json()
Otherwise, you can do an initial call, verify how many more requests to make, and alter the appropriate parameters in the URL. You can see that pageCount is returned.
You can see the relevant parts of the response here for pageSize 20:
{'rowCount': 1343,
'pageNum': 1,
'pageSize': '20',
'pageCount': 68,
All the relevant info for a loop to get all results is there.
After altering it to a larger number you can see the following:
'rowCount': 1343,
'pageNum': 1,
'pageSize': '2000',
'pageCount': 1,
You can convert to a table using pandas:
import requests
import pandas as pd
r = requests.get('https://cpj.org/api/datamanager/reports/entries?distinct(personId)&includes=organizations,fullName,location,status,typeOfDeath,charges,startDisplay,mtpage,country,type,motiveConfirmed&sort=fullName&pageNum=1&pageSize=2000&in(status,%27Killed%27)&or(eq(type,%22media%20worker%22),in(motiveConfirmed,%27Confirmed%27))&in(type,%27Journalist%27)&ge(year,1992)&le(year,2019)').json()
df = pd.DataFrame(r['data'])
print(df)
Example of checking the actual count and making an additional request for the remaining records:
import requests
import pandas as pd
request_number = 1000
with requests.Session() as s:
    r = s.get('https://cpj.org/api/datamanager/reports/entries?distinct(personId)&includes=organizations,fullName,location,status,typeOfDeath,charges,startDisplay,mtpage,country,type,motiveConfirmed&sort=fullName&pageNum=1&pageSize=' + str(request_number) + '&in(status,%27Killed%27)&or(eq(type,%22media%20worker%22),in(motiveConfirmed,%27Confirmed%27))&in(type,%27Journalist%27)&ge(year,1992)&le(year,2019)').json()
    df = pd.DataFrame(r['data'])
    actual_number = r['rowCount']
    if actual_number > request_number:
        request_number = actual_number - request_number
        r = s.get('https://cpj.org/api/datamanager/reports/entries?distinct(personId)&includes=organizations,fullName,location,status,typeOfDeath,charges,startDisplay,mtpage,country,type,motiveConfirmed&sort=fullName&pageNum=2&pageSize=' + str(request_number) + '&in(status,%27Killed%27)&or(eq(type,%22media%20worker%22),in(motiveConfirmed,%27Confirmed%27))&in(type,%27Journalist%27)&ge(year,1992)&le(year,2019)').json()
        df2 = pd.DataFrame(r['data'])
        final = pd.concat([df, df2])
    else:
        final = df
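Alternatively, instead of bumping pageSize and re-requesting, the returned pageCount can drive a simple loop over pageNum. A sketch using the same endpoint and query parameters as above; only the page number changes, and pageSize=500 is an arbitrary choice:

import requests
import pandas as pd

base = ('https://cpj.org/api/datamanager/reports/entries?distinct(personId)'
        '&includes=organizations,fullName,location,status,typeOfDeath,charges,'
        'startDisplay,mtpage,country,type,motiveConfirmed&sort=fullName'
        '&pageNum={page}&pageSize=500&in(status,%27Killed%27)'
        '&or(eq(type,%22media%20worker%22),in(motiveConfirmed,%27Confirmed%27))'
        '&in(type,%27Journalist%27)&ge(year,1992)&le(year,2019)')

frames = []
with requests.Session() as s:
    first = s.get(base.format(page=1)).json()
    frames.append(pd.DataFrame(first['data']))
    # pageCount is part of the JSON response shown above
    for page in range(2, first['pageCount'] + 1):
        frames.append(pd.DataFrame(s.get(base.format(page=page)).json()['data']))
final = pd.concat(frames, ignore_index=True)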
To get the tabular content using the selectors you see when inspecting elements, you can try pyppeteer, which I've shown below how to work with. The approach is asynchronous, so I suggest you go for it only if you can't find an API to play with:
import asyncio
from pyppeteer import launch
url = "https://cpj.org/data/killed/?status=Killed&motiveConfirmed%5B%5D=Confirmed&type%5B%5D=Journalist&start_year=1992&end_year=2019&group_by=year"
async def get_table(link):
    browser = await launch(headless=False)
    [page] = await browser.pages()
    await page.goto(link)
    await page.waitForSelector("table.js-report-builder-table tr td")
    for tr in await page.querySelectorAll("table.js-report-builder-table tr"):
        tds = [await page.evaluate('e => e.innerText', td) for td in await tr.querySelectorAll("th,td")]
        print(tds)

if __name__ == '__main__':
    loop = asyncio.get_event_loop()
    loop.run_until_complete(get_table(url))
The output looks like:
['Name', 'Organization', 'Date', 'Location', 'Attack', 'Type of Death', 'Charge']
['Abadullah Hananzai', 'Radio Azadi,Radio Free Europe/Radio Liberty', 'April 30, 2018', 'Afghanistan', 'Killed', 'Murder', '']
['Abay Hailu', 'Agiere', 'February 9, 1998', 'Ethiopia', 'Killed', 'Dangerous Assignment', '']
['Abd al-Karim al-Ezzo', 'Freelance', 'December 21, 2012', 'Syria', 'Killed', 'Crossfire', '']
['Abdallah Bouhachek', 'Révolution et Travail', 'February 10, 1996', 'Algeria', 'Killed', 'Murder', '']
['Abdel Aziz Mahmoud Hasoun', 'Masar Press', 'September 5, 2013', 'Syria', 'Killed', 'Crossfire', '']
['Abdel Karim al-Oqda', 'Shaam News Network', 'September 19, 2012', 'Syria', 'Killed', 'Murder', '']

How to iterate over multiple links, scrape every one of them one by one, and save the output in CSV using Python, BeautifulSoup and requests

I have this code but don't know how to read the links from a CSV or a list. I want to read the links, scrape details off every single link, and then save the data for each link in its own columns of an output CSV.
Here is the code I built to get specific data.
from bs4 import BeautifulSoup
import requests

url = "http://www.ebay.com/itm/282231178856"
r = requests.get(url)
x = BeautifulSoup(r.content, "html.parser")
# print(x.prettify().encode('utf-8'))
# time to find some tags!!

# title extraction
z = x.find_all("h1", {"itemprop": "name"})
for item in z:
    try:
        print(item.text.replace('Details about ', ''))
    except:
        pass
# category extraction
m = x.find_all("span", {"itemprop": "name"})
for item in m:
    try:
        print(item.text)
    except:
        pass
# item condition extraction
n = x.find_all("div", {"itemprop": "itemCondition"})
for item in n:
    try:
        print(item.text)
    except:
        pass
# sold number extraction
k = x.find_all("span", {"class": "vi-qtyS vi-bboxrev-dsplblk vi-qty-vert-algn vi-qty-pur-lnk"})
for item in k:
    try:
        print(item.text)
    except:
        pass
# watchers extraction
u = x.find_all("span", {"class": "vi-buybox-watchcount"})
for item in u:
    try:
        print(item.text)
    except:
        pass
# returns details extraction
t = x.find_all("span", {"id": "vi-ret-accrd-txt"})
for item in t:
    try:
        print(item.text)
    except:
        pass
# per hour / per day views
a = x.find_all("div", {"class": "vi-notify-new-bg-dBtm"})
for item in a:
    try:
        print(item.text)
    except:
        pass
# trending-at price
b = x.find_all("span", {"class": "mp-prc-red"})
for item in b:
    try:
        print(item.text)
    except:
        pass
Your question is kind of vague!
Which links are you talking about? There are a hundred of them on a single eBay page. And which info would you like to scrape? There is a ton of that too.
But anyway, here is how I would proceed:
# First, create a list of urls you want to iterate on
import requests
from bs4 import BeautifulSoup

urls = []
start_url = "http://www.ebay.com/itm/282231178856"  # or whichever page holds your links of interest
soup = BeautifulSoup(requests.get(start_url).text, "html.parser")
# Assuming your links of interest are values of "href" attributes within <a> tags
for tag in soup.find_all("a", href=True):
    urls.append(tag["href"])

# Second, start to iterate while storing the info
info_1, info_2 = [], []
for link in urls:
    soup = BeautifulSoup(requests.get(link).text, "html.parser")
    # Do stuff here, maybe it's time to define your existing loops as functions?
    info_a, info_b = YourFunctionReturningValues(soup)
    info_1.append(info_a)
    info_2.append(info_b)
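Since you mention the links already sit in a CSV or a list, the urls list can just as well be loaded from a file instead of scraped from a page. A short sketch, assuming a hypothetical file named links.csv with one URL in the first column of each row:

import csv

urls = []
with open("links.csv", newline="") as fh:
    for row in csv.reader(fh):
        if row:                      # skip empty lines
            urls.append(row[0])      # the URL is expected in the first column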
Then if you want a nice csv output:
# Don't forget to import the csv module
import csv

with open(r"path_to_file.csv", "w", newline="") as my_file:
    csv_writer = csv.writer(my_file, delimiter=",")
    csv_writer.writerows(zip(urls, info_1, info_2))
Hope this helps!
Of course, don't hesitate to give additional info so I can go into more detail.
On attributes with BeautifulSoup: https://www.crummy.com/software/BeautifulSoup/bs4/doc/#attributes
About the csv module: https://docs.python.org/2/library/csv.html
