extracting data using bs4 from columns with same td class name - web-scraping

I'm trying to scrape a table from this URL: https://finance.yahoo.com/quote/AAPL/history?p=AAPL.
This is my code:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
from random import randint
url = 'https://finance.yahoo.com/quote/AAPL/history?p=AAPL'
r = requests.get(url)
soup = BeautifulSoup(r.text, 'html.parser')
date = []
t = soup.find_all(class_="W(100%) M(0)")
for i in t:
    you = i.find_all('td', class_='Py(10px) Ta(start) Pend(10px)')
I have no problem getting the date column, but when I run the code below for the 2nd column, it returns the data for all of the remaining 6 columns:
for i in t:
    u = i.find_all(class_='Py(10px) Pstart(10px)')
    for k in u:
        print(k.text)
I want to get each individual column one at a time, i.e. open, high, low, close, etc. How can I accomplish this using bs4?

You could select each column by its position with select_one('td:nth-of-type(n)').
Example:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
from random import randint

url = 'https://finance.yahoo.com/quote/AAPL/history?p=AAPL'
r = requests.get(url)
soup = BeautifulSoup(r.text, 'html.parser')

t = soup.find_all(class_="BdT Bdc($seperatorColor) Ta(end) Fz(s) Whs(nw)")
for i in t:
    if i.select_one('td:nth-of-type(3)'):
        date = i.select_one('td:nth-of-type(1)').text
        start = i.select_one('td:nth-of-type(2)').text
        high = i.select_one('td:nth-of-type(3)').text
        low = i.select_one('td:nth-of-type(4)').text
        close = i.select_one('td:nth-of-type(5)').text
        adjClose = i.select_one('td:nth-of-type(6)').text
        volume = i.select_one('td:nth-of-type(7)').text
        print(date, start, high, low, close, adjClose, volume)
Output
Dec 31, 2020 134.08 134.74 131.72 132.69 132.69 98,990,400
Dec 30, 2020 135.58 135.99 133.40 133.72 133.72 96,452,100
Dec 29, 2020 138.05 138.79 134.34 134.87 134.87 121,047,300
Dec 28, 2020 133.99 137.34 133.51 136.69 136.69 124,486,200
Dec 24, 2020 131.32 133.46 131.10 131.97 131.97 54,930,100
Dec 23, 2020 132.16 132.43 130.78 130.96 130.96 88,223,700
Dec 22, 2020 131.61 134.41 129.65 131.88 131.88 168,904,800
Dec 21, 2020 125.02 128.31 123.45 128.23 128.23 121,251,600
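As an aside, since pandas is already imported you can often grab the whole table in one call with pandas.read_html instead of walking the cells by hand. This is a minimal, hedged sketch; it assumes Yahoo will serve the full page to a plain request with a browser-like User-Agent, and the page layout may change at any time:
import pandas as pd
import requests

url = 'https://finance.yahoo.com/quote/AAPL/history?p=AAPL'
# Assumption: a browser-like User-Agent is enough to get the full HTML back.
html = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'}).text
tables = pd.read_html(html)   # parses every <table> on the page into a DataFrame
history = tables[0]           # assumption: the price history is the first table
print(history.head())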

Related

How to add a default date in streamlit date_input

I have a date input and want to make yesterday the default date.
import datetime
import streamlit as st

complition_date = st.date_input("Date of completion", datetime.date(2017, 8, 19))
How can I accomplish that?
import streamlit as st
from datetime import date, timedelta

today = date.today()
default_date_yesterday = today - timedelta(days=1)
complition_date = st.date_input("Date of completion", default_date_yesterday)
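If you also want to keep users from picking anything later than yesterday, st.date_input takes a max_value argument as well (a hedged addition; check the Streamlit version you are running):
complition_date = st.date_input("Date of completion",
                                value=default_date_yesterday,
                                max_value=default_date_yesterday)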

Prices webscraping using BeautifulSoup

Goal: I'm trying to scrape prices.
Expected output: two columns, 1) productName (OK) and 2) price (not OK, I get NaN).
I tried the following:
import urllib3
from urllib.request import urlopen
from bs4 import BeautifulSoup
import pandas as pd
import time

urllib3.disable_warnings()
t0 = time.time()
page_proximus = urlopen("https://www.proximus.be/fr/id_cr_apple-iphone-13-256gb-pink/particuliers/equipement/boutique/apple-iphone-13-256gb-pink.html")
soup = BeautifulSoup(page_proximus, 'html.parser')
scrap_list = pd.DataFrame(columns=['Item_name', 'Item_price'])
url = 'https://www.proximus.be/fr/id_cr_apple-iphone-13-256gb-pink/particuliers/equipement/boutique/apple-iphone-13-256gb-pink.html' + str(page_list)
req = urllib3
res = req.request
soup = BeautifulSoup(page_proximus, 'html.parser')
html = urlopen('https://www.proximus.be/fr/id_cr_apple-iphone-13-256gb-pink/particuliers/equipement/boutique/apple-iphone-13-256gb-pink.html').read().decode("utf-8")
bs = BeautifulSoup(html, 'html.parser')
scrap_name = bs.find_all(["h1"])
product_name = pd.DataFrame(scrap_name, columns=['Item_name'])
scrap_price = bs.find_all("span", {'class': 'rs-unit'})
product_price = pd.DataFrame(scrap_price, columns=['Item_price'])
scrap_list = scrap_list.append(pd.concat([product_name['Item_name'], product_price['Item_price']],
                                         axis=1))
t1 = time.time()
r = t1 - t0
print(r)
print(scrap_list)
The data is within the <meta> tags.
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
t0 = time.time()
page_proximus = requests.get("https://www.proximus.be/fr/id_cr_apple-iphone-13-256gb-pink/particuliers/equipement/boutique/apple-iphone-13-256gb-pink.html")
soup = BeautifulSoup(page_proximus.text, 'html.parser')
rows = []
metaData = soup.find_all('meta',{'property':'og:description'})
for meta in metaData:
    row = {'Item_name': meta.find('meta', {'name': 'device_model'})['content'],
           'Item_price': meta.find('meta', {'name': 'device_price'})['content']}
    rows.append(row)
t1 = time.time()
r=t1-t0
print(r)
df = pd.DataFrame(rows)
print(df)
Output:
Item_name Item_price
0 iPhone 13 256GB Pink 1029,99
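If the nested meta lookup ever stops matching, the same two tags can usually be selected directly by their name attribute; a hedged alternative sketch using the tag names from the snippet above:
model_tag = soup.select_one('meta[name="device_model"]')
price_tag = soup.select_one('meta[name="device_price"]')
if model_tag and price_tag:
    print(model_tag['content'], price_tag['content'])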

Bokeh LabelSet x axis being datetime

I am new to Bokeh and looking for a solution to label each data point. Replicating the examples shown in the documentation, I could not find a solution with the x axis being datetime.
import pandas as mypd
from bokeh.models import LabelSet , ColumnarDataSource
from bokeh.plotting import figure, output_file, show
date_1 = ['2020-01-01', '2020-01-02','2020-01-03','2020-01-04','2020-01-05']
sal = mypd.DataFrame(date_1)
sal.columns = ["Date_1"]
sal['Sales'] = [15,25,36,17,4]
sal['Date_1'] = mypd.to_datetime(sal['Date_1'])
p = figure(x_axis_type="datetime")
p.line(x=sal['Date_1'], y=sal['Sales'])
lab = LabelSet(x=sal['Date_1'], y=sal['Sales'], text=sal['Sales'])
p.add_layout(lab)
show(p)
It throws the error:
ValueError: expected an element of either String, Dict(Enum('expr', 'field', 'value', 'transform'), Either(String, Instance(Transform), Instance(Expression), Float)) or Float, got 0 2020-01-01
I understand the error occurs because LabelSet expects numerical data for x.
Is my understanding correct?
If yes, what is the workaround?
I tried with similar queries but could not find a solution for myself.
Similar Query
And this
The simplest solution is to just use a common data source. It also prevents you from embedding the data twice.
import pandas as pd
from bokeh.models import LabelSet, ColumnDataSource
from bokeh.plotting import figure, show
sal = (pd.DataFrame({'Date_1': pd.to_datetime(['2020-01-01', '2020-01-02', '2020-01-03', '2020-01-04', '2020-01-05']),
                     'Sales': [15, 25, 36, 17, 4]})
         .set_index('Date_1'))
ds = ColumnDataSource(sal)
p = figure(x_axis_type="datetime")
p.line(x='Date_1', y='Sales', source=ds)
lab = LabelSet(x='Date_1', y='Sales', text='Sales', source=ds)
p.add_layout(lab)
show(p)
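If the labels end up sitting directly on the line, LabelSet also accepts pixel offsets and text styling; a small hedged addition to the example above:
lab = LabelSet(x='Date_1', y='Sales', text='Sales', source=ds,
               y_offset=8, text_font_size='9pt')
p.add_layout(lab)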

How to scrape this page with BeautifulSoup?

I am trying to scrape the page below using the following BeautifulSoup code:
import requests
from urllib.request import urlopen
from bs4 import BeautifulSoup
import lxml
url = 'https://remittanceprices.worldbank.org/en/corridor/Australia/China'
page=urlopen(url)
bs = BeautifulSoup(page,"lxml")
print(bs.get_text())
all_links = bs.find_all("div", {"class": "views-field views-field-title"})
for link in all_links:
    content = link.get_text()
    print(content)

all_links = bs.find_all("div", {"class": "mobile-header"})
for link in all_links:
    content = link.get_text()
    print(content)
Can you please provide some pointers to print/extract the data for all firms in the format below?
Firm|product|Fee|Exchange rate margin(%)|Total Cost Percent(%)|Total Cost(AUD)
Bank of China|28.00|5.77|19.77|39.54
ANZ Bank|32.00|4.39|20.39|40.78
Regards
-Abacus
import requests
from bs4 import BeautifulSoup
url = 'https://remittanceprices.worldbank.org/en/corridor/Australia/China'
r = requests.get(url,verify=False)
soup = BeautifulSoup(r.text,'lxml')
rows = [i.get_text("|").split("|") for i in soup.select('#tab-1 .corridor-row')]
for row in rows:
    # a, b, c, d, e = row[2], row[15], row[18], row[21], row[25]
    # print(a, b, c, d, e, sep='|')
    print('{0[2]}|{0[15]}|{0[18]}|{0[21]}|{0[25]}'.format(row))
Citibank|0.00|1.53|1.53|3.06
Transferwise|5.05|-0.04|2.48|4.96
Western Union|5.00|1.19|3.69|7.38
MoneyGram|8.00|1.06|5.06|10.12
WorldRemit|7.99|1.30|5.30|10.60
Ria|10.00|0.84|5.84|11.68
Ceylon Exchange|10.00|1.37|6.37|12.74
Western Union|9.95|1.69|6.66|13.32
Orbit Remit|13.00|0.78|7.28|14.56
Money2anywhere|12.00|1.71|7.71|15.42
SUPAY|18.00|-1.24|7.76|15.52
Money Chain Foreign Exchange|18.00|-1.12|7.88|15.76
MoneyGram|15.00|1.30|8.80|17.60
Commonwealth Bank|22.00|3.43|14.43|28.86
Bank of China|28.00|1.50|15.50|31.00
ANZ Bank|24.00|4.51|16.51|33.02
National Australia Bank (NAB)|22.00|5.74|16.74|33.48
Bank of China|32.00|1.50|17.50|35.00
Commonwealth Bank|30.00|3.43|18.43|36.86
ANZ Bank|32.00|4.51|20.51|41.02
National Australia Bank (NAB)|30.00|5.74|20.74|41.48
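If you would rather collect the result into a table than print pipe-separated lines, the same indices can feed a pandas DataFrame. A hedged sketch that reuses rows from the snippet above; the five column names follow the question (the printed rows do not include a separate product field):
import pandas as pd

cols = ['Firm', 'Fee', 'Exchange rate margin (%)', 'Total cost (%)', 'Total cost (AUD)']
df = pd.DataFrame([[row[2], row[15], row[18], row[21], row[25]] for row in rows],
                  columns=cols)
print(df.to_string(index=False))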

How to find frequencies of a days of a certain year using mapreduce and pyspark

I have a text file (61 GB) containing, on each line, a string representing a date, e.g. Thu Dec 16 18:53:32 +0000 2010.
Iterating over the file on a single core would take too long, so I would like to use PySpark and MapReduce to quickly find the frequency of lines per day in a certain year.
What I think is a good start:
import dateutil.parser

text_file = sc.textFile('dates.txt')
date_freqs = text_file.map(lambda line: dateutil.parser.parse(line)) \
    .map(lambda date: date + 1) \
    .reduceByKey(lambda a, b: a + b)
Unfortunately I can't figure out how to filter on a certain year and reduce by key. The key is the day.
Example output:
Thu Dec 16 26543
Thu Dec 17 345
etc.
As alluded to in another answer, dateutil.parser.parse returns a datetime object which has year, month, and day attributes:
>>> dt = dateutil.parser.parse('Thu Dec 16 18:53:32 +0000 2010')
>>> dt.year
2010
>>> dt.month
12
>>> dt.day
16
Starting with this RDD:
>>> rdd = sc.parallelize([
... 'Thu Oct 21 5:12:38 +0000 2010',
... 'Thu Oct 21 4:12:38 +0000 2010',
... 'Wed Sep 22 15:46:40 +0000 2010',
... 'Sun Sep 4 22:28:48 +0000 2011',
... 'Sun Sep 4 21:28:48 +0000 2011'])
Here's how you can get the counts for all year-month-day combinations:
>>> from operator import attrgetter
>>> counts = rdd.map(dateutil.parser.parse).map(
... attrgetter('year', 'month', 'day')).countByValue()
>>> counts
defaultdict(<class 'int'>, {(2010, 9, 22): 1, (2010, 10, 21): 2, (2011, 9, 4): 2})
To get the output you want:
>>> import datetime
>>> for k, v in counts.items():
...     print(datetime.datetime(*k).strftime('%a %b %d'), v)
...
Wed Sep 22 1
Thu Oct 21 2
Sun Sep 04 2
If you want counts for only a certain year, you can filter the RDD before doing the count:
>>> counts = rdd.map(dateutil.parser.parse).map(
... attrgetter('year', 'month', 'day')).filter(
... lambda ymd: ymd[0] == 2010).countByValue()
>>> counts
defaultdict(<class 'int'>, {(2010, 9, 22): 1, (2010, 10, 21): 2})
Something along the lines of this might be a good start:
import dateutil.parser

text_file = sc.textFile('dates.txt')
date_freqs = text_file.map(lambda line: dateutil.parser.parse(line)) \
    .keyBy(lambda date: (date.year, date.month, date.day)) \
    .countByKey()
I should add that dateutil is not part of the Python standard library. If you do not have sudo rights on your cluster, this could pose a problem. As an alternative I would suggest using the built-in datetime:
import datetime
from operator import attrgetter

def parse_line(d):
    # e.g. 'Thu Dec 16 18:53:32 +0000 2010' -> drop the '+0000' field, keep the rest
    f = "%a %b %d %X %Y"
    date_list = d.split()
    date = date_list[:4]
    date.append(date_list[5])
    date = ' '.join(date)
    return datetime.datetime.strptime(date, f)

counts = rdd.map(parse_line)\
    .map(attrgetter('year', 'month', 'day'))\
    .filter(lambda ymd: ymd[0] == 2015)\
    .countByValue()
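A quick sanity check of parse_line on the sample line from the question:
>>> parse_line('Thu Dec 16 18:53:32 +0000 2010')
datetime.datetime(2010, 12, 16, 18, 53, 32)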
I would also be interested in better solutions using Parquet, Rows/Columns, etc.
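For anyone who prefers the DataFrame API over raw RDDs, here is a hedged sketch of the same per-day count; it assumes an existing SparkSession named spark, the same one-date-per-line file, and reuses parse_line from above as a UDF:
from pyspark.sql import functions as F
from pyspark.sql.types import TimestampType

# Wrap the plain-Python parser in a UDF so it can run on a DataFrame column.
parse_udf = F.udf(parse_line, TimestampType())

dates = spark.read.text('dates.txt').select(parse_udf('value').alias('ts'))
daily_counts = (dates.filter(F.year('ts') == 2010)
                     .groupBy(F.to_date('ts').alias('day'))
                     .count()
                     .orderBy('day'))
daily_counts.show()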
