Can someone please guide me on how to extract the data from this particular table? I have tried multiple times but have not succeeded in extracting the required data.
import requests
from bs4 import BeautifulSoup as bs
import pandas as pd

r = requests.get('https://etfdb.com/etf/ICLN/#fact-sheet', proxies=proxy_support).text
soup = bs(r, 'html.parser')
da = soup.find_all('ul', {'class': 'list-unstyled'})[0]

n_rows = 0
n_columns = 0
column_names = []
for row in da.find_all('li'):
    td_tags = row.find('span')
    if len(td_tags) > 0:
        n_rows += 1
        if n_columns == 0:
            n_columns = len(td_tags)
    th_tags = row.find_all('a href')
    if len(th_tags) > 0 and len(column_names) == 0:
        for th in th_tags:
            column_names.append(th.get_text())

if len(column_names) > 0 and len(column_names) != n_columns:
    raise Exception("Column titles do not match the number of columns")

columns = column_names if len(column_names) > 0 else range(0, n_columns)
df = pd.DataFrame(columns=columns, index=range(0, n_rows))

row_marker = 0
for row in da.find_all('li'):
    column_marker = 0
    columns = row.find_all('span')
    for column in columns:
        df.iat[row_marker, column_marker] = columns.get_text()
        column_marker += 1
    if len(columns) > 0:
        row_marker += 1
For the code above I get the following error:
AttributeError: ResultSet object has no attribute 'get_text'. You're
probably treating a list of items like a single item. Did you call
find_all() when you meant to call find()?
Can anyone tell me what I am doing wrong?
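The AttributeError itself comes from the cell-filling loop: columns is the whole ResultSet returned by find_all, while column is the single tag being iterated over, so the assignment should use the loop variable:

# Inside the second loop: use the loop variable, not the ResultSet.
for column in columns:
    df.iat[row_marker, column_marker] = column.get_text()  # was columns.get_text()
    column_marker += 1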
With bs4 4.7.1, to get the first table:
import requests
from bs4 import BeautifulSoup as bs
r = requests.get('https://etfdb.com/etf/ICLN/#fact-sheet')
soup = bs(r.content, 'lxml')
items = soup.select('h3:contains(Vitals) + ul li')
for item in items:
    print([i.text for i in item.select('span')])
For earlier bs4 versions:
items = soup.select_one('h3 + ul').select('li')
for item in items:
    print([i.text for i in item.select('span')])
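If you want the rows in pandas, as in your original attempt, here is a minimal sketch building on the same selector (assuming each li still holds one label span and one value span; the live page may have changed):

import requests
import pandas as pd
from bs4 import BeautifulSoup as bs

r = requests.get('https://etfdb.com/etf/ICLN/#fact-sheet')
soup = bs(r.content, 'lxml')

rows = []
for item in soup.select('h3:contains(Vitals) + ul li'):
    cells = [span.text.strip() for span in item.select('span')]
    if len(cells) == 2:  # expect one label span and one value span per row
        rows.append(cells)

df = pd.DataFrame(rows, columns=['Field', 'Value'])
print(df)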
Whenever I run this code, I get a "pseudo-class is not implemented" error. I found this code online, and I am trying to scrape the relevant information about the cities from Wikipedia.
I have updated Python and Beautiful Soup to their most recent versions. Any help is greatly appreciated.
import requests
import bs4
from bs4 import BeautifulSoup as bs
import pandas as pd
import unicodedata
import re

# cities = ['Berlin', 'Hamburg', 'Frankfurt','Munich','Stuttgart','Leipzig','Cologne','Dresden','Hannover','Paris', 'Barcelona','Lisbon','Madrid']
cities = ['Berlin','Paris','Amsterdam','Barcelona','Rome','Lisbon','Prague','Vienna','Madrid']

def City_info(soup):
    ret_dict = {}
    ret_dict['city'] = soup.h1.get_text()
    if soup.select_one('.mergedrow:-soup-contains("Mayor")>.infobox-label') != None:
        i = soup.select_one('.mergedrow:-soup-contains("Mayor")>.infobox-label')
        mayor_name_html = i.find_next_sibling()
        mayor_name = unicodedata.normalize('NFKD', mayor_name_html.get_text())
        ret_dict['mayor'] = mayor_name
    if soup.select_one('.mergedrow:-soup-contains("City")>.infobox-label') != None:
        j = soup.select_one('.mergedrow:-soup-contains("City")>.infobox-label')
        area = j.find_next_sibling('td').get_text()
        ret_dict['city_size'] = unicodedata.normalize('NFKD', area)
    if soup.select_one('.mergedtoprow:-soup-contains("Elevation")>.infobox-data') != None:
        k = soup.select_one('.mergedtoprow:-soup-contains("Elevation")>.infobox-data')
        elevation_html = k.get_text()
        ret_dict['elevation'] = unicodedata.normalize('NFKD', elevation_html)
    if soup.select_one('.mergedtoprow:-soup-contains("Population")') != None:
        l = soup.select_one('.mergedtoprow:-soup-contains("Population")')
        c_pop = l.findNext('td').get_text()
        ret_dict['city_population'] = c_pop
    if soup.select_one('.infobox-label>[title^=Urban]') != None:
        m = soup.select_one('.infobox-label>[title^=Urban]')
        u_pop = m.findNext('td')
        ret_dict['urban_population'] = u_pop.get_text()
    if soup.select_one('.infobox-label>[title^=Metro]') != None:
        n = soup.select_one('.infobox-label>[title^=Metro]')
        m_pop = n.findNext('td')
        ret_dict['metro_population'] = m_pop.get_text()
    if soup.select_one('.latitude') != None:
        o = soup.select_one('.latitude')
        ret_dict['lat'] = o.get_text()
    if soup.select_one('.longitude') != None:
        p = soup.select_one('.longitude')
        ret_dict['long'] = p.get_text()
    return ret_dict

list_of_city_info = []
for city in cities:
    url = 'https://en.wikipedia.org/wiki/{}'.format(city)
    web = requests.get(url)
    soup = bs(web.content, 'html.parser')  # the parser belongs here, not in requests.get
    list_of_city_info.append(City_info(soup))

df_cities = pd.DataFrame(list_of_city_info)
df_cities = df_cities.set_index('city')
df_cities
I have not found any solutions for this unfortunately.
:-soup-contains() is a CSS pseudo-class selector for targeting a node's text.
It comes with Soup Sieve, the official CSS select implementation for Beautiful Soup 4.7.0+, so for most people using Beautiful Soup 4.7.0+ your script should work fine.
So first check that your version is up to date; older versions only support the now-deprecated form :contains().
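A quick way to confirm what you actually have installed (as a rough guide, the :-soup-contains() spelling arrived with Soup Sieve 2.1):

import bs4
import soupsieve

print(bs4.__version__)        # Beautiful Soup itself; want 4.7.0+
print(soupsieve.__version__)  # Soup Sieve; :-soup-contains() needs 2.1+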
With Beautifulsoup4 and Python 3.7 I'm trying to loop over some arrays of links and then get some text from tags, but I'm encountering an error when I run the code in the terminal.
Here is the code:
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup
import re
import csv

my_url = "http://www.example.com"
uClient = uReq(my_url)
page_html = uClient.read()
uClient.close()
page_soup = soup(page_html, "html.parser")
links = page_soup.select('dt > a[href]')
link = [tag.get('href') for tag in links]
i = 0
for i in range(0, 5000):
    url = link[i]
    Client = uReq(url)
    pageHtml = Client.read()
    Client.close()
    pSoup = soup(pageHtml, "html.parser")
    linkeas = pSoup.findAll(href=re.compile(my_url))
    def linkas(href):
        return href and re.compile("html").search(href) and re.compile(my_url).search(href)
    linka = pSoup.findAll(href=linkas)
    if linka != []:
        linkia = [tag.get('href') for tag in linka]
        linko = len(linkia)
        j = 0
        for j in range(0, linko):
            curl = linkia[j]
            cClient = uReq(curl)
            pageHtml = cClient.read()
            cClient.close()
            Soup = soup(page_html, "html.parser")
            country = Soup.select('.class > a:nth-of-type(3)')
            countri = country[0].text.strip()
            print(countri)
I've tried several approaches for days but have gotten no further than this:
Traceback (most recent call last):
File "<stdin>", line 22, in <module>
IndexError: list index out of range
Could someone give me a tip?
NOTE:
The arrays look like this:
print(linkia)
['http://www.example/example/1.html']
['http://www.example/example/2.html']
['http://www.example/example/3.html', 'http://www.example/example/4.html',
'http://www.example/example/5.html', 'http://www.example/example/6.html',
'http://www.example/example/7.html', 'http://www.example/example/8.html',
'http://www.example/example/9.html', 'http://www.example/example/10.html',
'http://www.example/example/11.html', 'http://www.example/example/12.html',
'http://www.example/example/13.html', 'http://www.example/example/14.html',
'http://www.example/example/15.html', 'http://www.example/example/16.html',
'http://www.example/example/17.html', 'http://www.example/example/18.html',
'http://www.example/example/19.html']
['http://www.example/example/20.html', 'http://www.example/example/example/21.html',
'http://www.example/example/example/22.html']
['http://www.example/example/23.html']
Thanks a lot for your time, I really appreciate it. I'll stay connected and will respond quickly.
change:
i = 0
for i in range(0, 5000):
    url = link[i]
to just:
for url in link:
And then you can get rid of the url = link[i] line.
You're essentially telling it to loop through 5000 items in your list, when you don't have 5000 items, hence the list index out of range. You really just want it to loop through each element until it runs out of items. And you can do that by simply saying for url in link:
Then the same for your other nested for loop.
change:
j = 0
for j in range(0, linko):
    curl = linkia[j]
to:
for curl in linkia:
I will also note that, if you were to keep your setup, you wouldn't need to initialize i or j to 0 first: since the range starts at 0, the for loop begins at that first element automatically. But that point is moot, as I would not recommend iterating through your list like that. It a) isn't robust (you would need exactly 5000 items in your list every time you reach that loop), and b) while your second loop works because its range runs to the length of the list, it is unnecessary since the whole thing condenses into one line.
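For completeness, if you ever do need the index alongside the item, enumerate gives you both without a manual counter (a generic Python idiom, shown here with sample URLs like those in the question):

link = ['http://www.example/example/1.html', 'http://www.example/example/2.html']
for i, url in enumerate(link):
    print(i, url)  # index and item together, no manual counter needed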
Try:
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup
import re
import csv

my_url = "http://www.example.com"
uClient = uReq(my_url)
page_html = uClient.read()
uClient.close()
page_soup = soup(page_html, "html.parser")
links = page_soup.select('dt > a[href]')
link = [tag.get('href') for tag in links]

for url in link:
    Client = uReq(url)
    pageHtml = Client.read()
    Client.close()
    pSoup = soup(pageHtml, "html.parser")
    linkeas = pSoup.findAll(href=re.compile(my_url))
    def linkas(href):
        return href and re.compile("html").search(href) and re.compile(my_url).search(href)
    linka = pSoup.findAll(href=linkas)
    if linka != []:
        linkia = [tag.get('href') for tag in linka]
        for curl in linkia:
            cClient = uReq(curl)
            pageHtml = cClient.read()
            cClient.close()
            Soup = soup(pageHtml, "html.parser")  # note: pageHtml, not the outer page_html
            country = Soup.select('.class > a:nth-of-type(3)')
            countri = country[0].text.strip()
            print(countri)
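One further cleanup worth considering: linkas is redefined, and its regular expressions recompiled, on every pass through the outer loop. Hoisting them out does the same filtering with less work; a sketch:

import re

my_url = "http://www.example.com"

# Compile once, reuse on every call; equivalent filtering, less work.
html_re = re.compile("html")
site_re = re.compile(my_url)

def linkas(href):
    # True only for hrefs that mention "html" and point back at my_url
    return href and html_re.search(href) and site_re.search(href)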
I am trying to use the bokeh server to plot a time series together with a shaded percentile band around it. Since bokeh does not support matplotlib's fill_between, this requires constructing a patch object of doubled dimension, so I need two ColumnDataSources to hold the data. However, only the first curve is rendered correctly when the data changes: although the data_source of the GlyphRenderer is updated, the figure does not change. I use bokeh 0.12.3 and have tried several servers and browsers. A complete and reasonably minimal example:
import numpy as np
from bokeh.plotting import figure
from bokeh.models import ColumnDataSource
from bokeh.layouts import column
from bokeh.io import curdoc
from bokeh.models.widgets import Select

class AppData:
    def __init__(self, n):
        self.p_source = None
        self.c_source = None
        self.x = np.linspace(0, 10, 20)
        self.n = n
        self.ys = [np.sin(self.x) - i for i in range(self.n)]
        self.line = None
        self.patch = None

    def update_module(self, a, b):
        assert b - a == 5
        p_data = dict() if self.p_source is None else self.p_source.data
        c_data = dict() if self.c_source is None else self.c_source.data
        ys = [self.ys[j] for j in range(a, b)]
        if "x" not in c_data:
            c_data["x"] = self.x
            p_data["x"] = c_data["x"].tolist() + c_data["x"][::-1].tolist()
        n_r = len(ys[0])
        n_p = 2 * n_r
        if "ys" not in p_data:
            p_data["ys"] = np.empty(n_p)
        p_data["ys"][:n_r] = ys[0]
        p_data["ys"][n_r:] = np.flipud(ys[-1])
        c_data["y"] = ys[2]
        if self.p_source is None:
            self.p_source = ColumnDataSource(data=p_data)
        else:
            self.p_source.data.update(p_data)
        if self.c_source is None:
            self.c_source = ColumnDataSource(data=c_data)
        else:
            self.c_source.data.update(c_data)
        if self.line is not None:
            print(max(self.line.data_source.data["y"]))
            print(max(self.patch.data_source.data["ys"]))  # The value changes, but the figure does not!

# initialize
app_data = AppData(10)
app_data.update_module(4, 4 + 5)

s1 = figure(width=500, plot_height=125, title=None, toolbar_location="above")
app_data.line = s1.line("x", "y", source=app_data.c_source)
app_data.patch = s1.patch("x", "ys", source=app_data.p_source, alpha=0.3, line_width=0)

select = Select(title="Case", options=[str(i) for i in range(5)], value="4")

def select_case(attrname, old, new):
    a = int(select.value)
    app_data.update_module(a, a + 5)

select.on_change('value', select_case)

layout = column(select, s1)
curdoc().add_root(layout)
curdoc().title = "Example of patches not being updated"
I am certainly not very experienced in using bokeh, so I could very well be using the system wrong. However, any help on this matter would be much appreciated!
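One hedged observation, not a verified fix: the line's column is replaced by item assignment (c_data["y"] = ys[2]), which bokeh can notice, while the patch's column is mutated in place through the numpy slice writes (p_data["ys"][:n_r] = ...), which bokeh cannot detect. A sketch of update_module that builds fresh columns each call and reassigns .data wholesale (my rewrite, untested on 0.12.3):

def update_module(self, a, b):
    # Build brand-new column containers instead of mutating the arrays
    # the sources already hold; assigning .data fires the change event.
    ys = [self.ys[j] for j in range(a, b)]
    c_data = dict(x=self.x, y=ys[2])
    p_data = dict(
        x=self.x.tolist() + self.x[::-1].tolist(),
        ys=np.concatenate([ys[0], np.flipud(ys[-1])]),
    )
    if self.p_source is None:
        self.p_source = ColumnDataSource(data=p_data)
        self.c_source = ColumnDataSource(data=c_data)
    else:
        self.p_source.data = p_data  # reassign wholesale, don't mutate
        self.c_source.data = c_data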
I'm trying to write a function that accepts a 2-dimensional (2D) list of characters (like a crossword puzzle) and a string as input arguments. The function must then search the columns of the 2D list to find a match of the word. If a match is found, the function should return a list containing the row index and column index of the start of the match; otherwise it should return None.
For example if the function is called as shown below:
crosswords = [['s','d','o','g'],['c','u','c','m'],['a','c','a','t'],['t','e','t','k']]
word = 'cat'
find_word_vertical(crosswords,word)
then the function should return:
[1,0]
def find_word_vertical(crosswords, word):
    columns = []
    finished = []
    for col in range(len(crosswords[0])):
        columns.append([crosswords[row][col] for row in range(len(crosswords))])
    for a in range(0, len(crosswords)):
        column = [crosswords[x][a] for x in range(len(crosswords))]
        finished.append(column)
    for row in finished:
        r = finished.index(row)
        whole_row = ''.join(row)
        found_at = whole_row.find(word)
        if found_at >= 0:
            return [found_at, r]
This one is for finding horizontal... could switching this around help?
def find_word_horizontal(crosswords, word):
    list1 = []
    row_index = -1
    column_index = -1
    refind = ''
    for row in crosswords:
        index = ''
        for column in row:
            index = index + column
        list1.append(index)
    for find_word in list1:
        if word in find_word:
            row_index = list1.index(find_word)
            refind = find_word
            column_index = find_word.index(word)
    ret = [row_index, column_index]
    if row_index != -1 and column_index != -1:
        return ret
The simple version is:
def find_word_vertical(crosswords, word):
    z = [list(i) for i in zip(*crosswords)]
    for rows in z:
        row_index = z.index(rows)
        single_row = ''.join(rows)
        column_index = single_row.find(word)
        if column_index >= 0:
            return [column_index, row_index]
This gives the correct output, [1,0].
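The trick here is that zip(*crosswords) transposes the grid, so the vertical search reduces to a horizontal search over the transposed rows. A tiny illustration:

grid = [['a', 'b'],
        ['c', 'd']]
# zip(*grid) pairs the i-th element of every row, i.e. yields the columns
print([list(col) for col in zip(*grid)])  # [['a', 'c'], ['b', 'd']]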
To find a word vertically:
def find_word_vertical(crosswords, word):
    if not crosswords or not word:
        return None
    for col_index in range(len(crosswords[0])):
        col_str = ''
        for row_index in range(len(crosswords)):
            col_str = col_str + crosswords[row_index][col_index]
        if col_str.find(word) >= 0:
            return [col_str.find(word), col_index]
To find a word horizontally:
def find_word_horizontal(crosswords, word):
    if not crosswords or not word:
        return None
    for index, row in enumerate(crosswords):
        row_str = ''.join(row)
        if row_str.find(word) >= 0:
            return [index, row_str.find(word)]
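A quick sanity check of the two helpers above, using the grid from the question:

crosswords = [['s','d','o','g'],
              ['c','u','c','m'],
              ['a','c','a','t'],
              ['t','e','t','k']]
print(find_word_vertical(crosswords, 'cat'))    # [1, 0]: column 0 reads 'scat'
print(find_word_horizontal(crosswords, 'cat'))  # [2, 1]: row 2 reads 'acat'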
# find a vertical word in a 2d list
def find_it(li, wo):
    out_list = []
    for row in range(len(li)):  # note: 'row' here indexes columns; assumes a square grid
        print(row)
        chek_word = ""
        for item in range(len(li)):
            chek_word = chek_word + li[item][row]
        print(chek_word)
        if wo in chek_word:
            print(chek_word.find(wo))
            out_list = [chek_word.find(wo), row]
            print(out_list)
            break
    return out_list  # [row, column] of the first letter; empty list if not found
This one is mine, and yes, it works.
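Checking it against the question's grid gives the same result, after the intermediate prints:

crosswords = [['s','d','o','g'],['c','u','c','m'],['a','c','a','t'],['t','e','t','k']]
print(find_it(crosswords, 'cat'))  # [1, 0]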
I want to do pairwise alignment with uniprot and pdb sequences. I have an input file containing uniprot and pdb IDs like this.
pdb id uniprot id
1dbh Q07889
1e43 P00692
1f1s Q53591
I need to:
1) read each line in the input file
2) retrieve the pdb and uniprot sequences from the pdb.fasta and uniprot.fasta files
3) do the alignment and calculate the sequence identity
Usually, I use the following program for pairwise alignment and seq.identity calculation.
library("seqinr")
seq1 <- "MDEKRRAQHNEVERRRRDKINNWIVQLSKIIPDSSMESTKSGQSKGGILSKASDYIQELRQSNHR"
seq2<- "MKGQQKTAETEEGTVQIQEGAVATGEDPTSVAIASIQSAATFPDPNVKYVFRTENGGQVM"
library(Biostrings)
globalAlign<- pairwiseAlignment(seq1, seq2)
pid(globalAlign, type = "PID3")
I need to print the output like this
pdbid uniprotid seq.identity
1dbh Q07889 99
1e43 P00692 80
1f1s Q53591 56
How can I change the above code? Your help would be appreciated!
This code is hopefully what you're looking for:
class test():
    def get_seq(self, pdb, fasta_file):  # Get sequences
        from Bio.PDB.PDBParser import PDBParser
        from Bio import SeqIO
        aa = {'ARG':'R','HIS':'H','LYS':'K','ASP':'D','GLU':'E','SER':'S','THR':'T','ASN':'N','GLN':'Q','CYS':'C','SEC':'U','GLY':'G','PRO':'P','ALA':'A','ILE':'I','LEU':'L','MET':'M','PHE':'F','TRP':'W','TYR':'Y','VAL':'V'}
        p = PDBParser(PERMISSIVE=1)
        structure_id = "%s" % pdb[:-4]
        structure = p.get_structure(structure_id, pdb)
        residues = structure.get_residues()
        seq_pdb = ''
        for res in residues:
            res = res.get_resname()
            if res in aa:
                seq_pdb = seq_pdb + aa[res]
        handle = open(fasta_file, "rU")
        for record in SeqIO.parse(handle, "fasta"):
            seq_fasta = record.seq
        handle.close()
        self.seq_aln(seq_pdb, seq_fasta)

    def seq_aln(self, seq1, seq2):  # Align the sequences
        from Bio import pairwise2
        from Bio.SubsMat import MatrixInfo as matlist
        matrix = matlist.blosum62
        gap_open = -10
        gap_extend = -0.5
        alns = pairwise2.align.globalds(seq1, seq2, matrix, gap_open, gap_extend)
        top_aln = alns[0]
        aln_seq1, aln_seq2, score, begin, end = top_aln
        with open('aln.fasta', 'w') as outfile:
            outfile.write('> PDB_seq\n' + str(aln_seq1) + '\n> Uniprot_seq\n' + str(aln_seq2))
        print aln_seq1 + '\n' + aln_seq2
        self.seq_id('aln.fasta')

    def seq_id(self, aln_fasta):  # Get sequence identity
        from Bio import AlignIO
        input_handle = open("aln.fasta", "rU")
        alignment = AlignIO.read(input_handle, "fasta")
        j = 0  # counts positions in first sequence
        i = 0  # counts identity hits
        for record in alignment:
            for amino_acid in record.seq:
                if amino_acid == '-':
                    pass
                else:
                    if amino_acid == alignment[0].seq[j]:
                        i += 1
                j += 1
            j = 0
            seq = str(record.seq)
            gap_strip = seq.replace('-', '')
            percent = 100 * i / len(gap_strip)
            print record.id + ' ' + str(percent)
            i = 0

a = test()
a.get_seq('1DBH.pdb', 'Q07889.fasta')
This outputs:
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------EQTYYDLVKAF-AEIRQYIRELNLIIKVFREPFVSNSKLFSANDVENIFSRIVDIHELSVKLLGHIEDTVE-TDEGSPHPLVGSCFEDLAEELAFDPYESYARDILRPGFHDRFLSQLSKPGAALYLQSIGEGFKEAVQYVLPRLLLAPVYHCLHYFELLKQLEEKSEDQEDKECLKQAITALLNVQSG-EKICSKSLAKRRLSESA-------------AIKK-NEIQKNIDGWEGKDIGQCCNEFI-EGTLTRVGAKHERHIFLFDGL-ICCKSNHGQPRLPGASNAEYRLKEKFF-RKVQINDKDDTNEYKHAFEIILKDENSVIFSAKSAEEKNNW-AALISLQYRSTL---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
MQAQQLPYEFFSEENAPKWRGLLVPALKKVQGQVHPTLESNDDALQYVEELILQLLNMLCQAQPRSASDVEERVQKSFPHPIDKWAIADAQSAIEKRKRRNPLSLPVEKIHPLLKEVLGYKIDHQVSVYIVAVLEYISADILKLVGNYVRNIRHYEITKQDIKVAMCADKVLMDMFHQDVEDINILSLTDEEPSTSGEQTYYDLVKAFMAEIRQYIRELNLIIKVFREPFVSNSKLFSANDVENIFSRIVDIHELSVKLLGHIEDTVEMTDEGSPHPLVGSCFEDLAEELAFDPYESYARDILRPGFHDRFLSQLSKPGAALYLQSIGEGFKEAVQYVLPRLLLAPVYHCLHYFELLKQLEEKSEDQEDKECLKQAITALLNVQSGMEKICSKSLAKRRLSESACRFYSQQMKGKQLAIKKMNEIQKNIDGWEGKDIGQCCNEFIMEGTLTRVGAKHERHIFLFDGLMICCKSNHGQPRLPGASNAEYRLKEKFFMRKVQINDKDDTNEYKHAFEIILKDENSVIFSAKSAEEKNNWMAALISLQYRSTLERMLDVTMLQEEKEEQMRLPSADVYRFAEPDSEENIIFEENMQPKAGIPIIKAGTVIKLIERLTYHMYADPNFVRTFLTTYRSFCKPQELLSLIIERFEIPEPEPTEADRIAIENGDQPLSAELKRFRKEYIQPVQLRVLNVCRHWVEHHFYDFERDAYLLQRMEEFIGTVRGKAMKKWVESITKIIQRKKIARDNGPGHNITFQSSPPTVEWHISRPGHIETFDLLTLHPIEIARQLTLLESDLYRAVQPSELVGSVWTKEDKEINSPNLLKMIRHTTNLTLWFEKCIVETENLEERVAVVSRIIEILQVFQELNNFNGVLEVVSAMNSSPVYRLDHTFEQIPSRQKKILEEAHELSEDHYKKYLAKLRSINPPCVPFFGIYLTNILKTEEGNPEVLKRHGKELINFSKRRKVAEITGEIQQYQNQPYCLRVESDIKRFFENLNPMGNSMEKEFTDYLFNKSLEIEPRNPKPLPRFPKKYSYPLKSPGVRPSNPRPGTMRHPTPLQQEPRKISYSRIPESETESTASAPNSPRTPLTPPPASGASSTTDVCSVFDSDHSSPFHSSNDTVFIQVTLPHGPRSASVSSISLTKGTDEVPVPPPVPPRRRPESAPAESSPSKIMSKHLDSPPAIPPRQPTSKAYSPRYSISDRTSISDPPESPPLLPPREPVRTPDVFSSSPLHLQPPPLGKKSDHGNAFFPNSPSPFTPPPPQTPSPHGTRRHLPSPPLTQEVDLHSIAGPPVPPRQSTSQHIPKLPPKTYKREHTHPSMHRDGPPLLENAHSS
PDB_seq 100 # pdb to itself would obviously have 100% identity
Uniprot_seq 24 # pdb sequence has 24% identity to the uniprot sequence
For this to work on your input file, you need to put my a.get_seq() in a for loop with the inputs from your text file.
EDIT:
Replace the seq_id function with this one:
def seq_id(self, aln_fasta):
    from Bio import SeqIO
    record_iterator = SeqIO.parse(aln_fasta, "fasta")
    first_record = record_iterator.next()
    print '%s has a length of %d' % (first_record.id, len(str(first_record.seq).replace('-', '')))
    second_record = record_iterator.next()
    print '%s has a length of %d' % (second_record.id, len(str(second_record.seq).replace('-', '')))
    lengths = [len(str(first_record.seq).replace('-', '')), len(str(second_record.seq).replace('-', ''))]
    if lengths.index(min(lengths)) == 0:  # if both sequences have the same length, the PDB sequence is taken as the shortest
        print 'PDB sequence has the shortest length'
    else:
        print 'Uniprot sequence has the shortest length'
    idenities = 0
    for i, v in enumerate(first_record.seq):
        if v == '-':
            continue  # skip gap columns in the first sequence
        if v == second_record.seq[i]:
            idenities += 1
    print 'Sequence identity = %.2f percent' % (100.0 * idenities / min(lengths))
To pass the arguments to the class, use:
with open('input_file.txt', 'r') as infile:
    next(infile)  # skip the header line of your input file
    for line in infile:
        segs = line.split()
        a.get_seq(segs[0] + '.pdb', segs[1] + '.fasta')
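Note that this answer is written for Python 2 (print statements, iterator .next()); under Python 3 you would need print(...) calls and the built-in next(), and newer Biopython releases have dropped Bio.SubsMat and deprecated pairwise2 in favor of Bio.Align. A hedged sketch of just the alignment step against the newer API, with illustrative placeholder sequences:

# Hedged Python 3 sketch of the alignment step with the newer Bio.Align
# API (assumes a recent Biopython; sequences are illustrative only).
from Bio import Align
from Bio.Align import substitution_matrices

seq1 = "MDEKRRAQHNEVERRRRDKINNWIVQLSKIIPDS"  # placeholder protein sequences
seq2 = "MKGQQKTAETEEGTVQIQEGAVATGEDPTSVAIA"

aligner = Align.PairwiseAligner()
aligner.substitution_matrix = substitution_matrices.load("BLOSUM62")
aligner.open_gap_score = -10
aligner.extend_gap_score = -0.5

best = aligner.align(seq1, seq2)[0]  # highest-scoring global alignment
print(best.score)
print(best)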
It might be something like this; a repeatable example (e.g., with short files posted on-line) would help...
library(Biostrings)
pdb = readAAStringSet("pdb.fasta")
uniprot = readAAStringSet("uniprot.fasta")
to input all sequences into two objects. pairwiseAlignment accepts a vector as its first (query) argument, so if you want to align all pdb against all uniprot, pre-allocate a result matrix
pids = matrix(numeric(), length(uniprot), length(pdb),
              dimnames=list(names(uniprot), names(pdb)))
and then do the calculations
for (i in seq_along(uniprot)) {
    globalAlignment = pairwiseAlignment(pdb, uniprot[i])
    pids[i,] = pid(globalAlignment)
}