Unable to scrape all photos even after using selenium to automate scroll - web-scraping

I am a newbie to web scraping. I am currently working on a project where I want to scrape all the photos of an Instagram user. The user has 521 posts in total, so I used Selenium to scroll down to the bottom of the profile, but I am still able to scrape only the first 37 photos. After further inspecting, I found that as the browser scrolls up or down, only the first few rows of img tags are present in the page source. As I scroll further, the previously visible img tags disappear and only the next rows appear, so only a certain number of rows of img tags exist in the HTML at any instant. I suspect this is why I can scrape only the first 37 photos.
I want to know how I can scrape all the photos of the profile. Below is my current code using Beautiful Soup and Selenium. The "scroll_down" function uses Selenium to scroll down to the bottom of the profile, and "downloading_images" tries to scrape all the 'img' tags, but, as already mentioned, it only gets the first 37 photos.
def downloading_images(self):
    soup = BeautifulSoup(self.driver.page_source, 'html.parser')
    self.all_images = soup.findAll('img')
    print(len(self.all_images))
    for index, image in enumerate(self.all_images):
        filename = "image_" + str(index) + ".jpg"
        image_path = os.path.join(self.path, filename)
        link = image['src']
        print("Downloading image ", index)
        response = requests.get(link, stream=True)
        try:
            with open(image_path, 'wb') as file:
                shutil.copyfileobj(response.raw, file)
        except Exception as e:
            print(e)
            print('Could not download image no.', index)
            print('Image link', link)
def scroll_down(self):
    sleep(3)
    try:
        num_posts = self.driver.find_element_by_xpath('//span[text()[contains(.," posts")]]/span[@class="g47SY "]')
        str_num_posts = str(num_posts.text).replace(',', '')
        self.int_num_posts = int(str_num_posts)
        if self.int_num_posts > 12:
            num_scrolls = int(self.int_num_posts/12) + 3
            print(num_scrolls)
            sleep(3)
            try:
                for win in range(num_scrolls):
                    print(win)
                    self.driver.execute_script('window.scrollTo(0,document.body.scrollHeight);')
                    sleep(3)
            except Exception as e:
                self.error = True
                print(e)
    except Exception as e:
        self.error = True
        print(e)
I searched through all the relevant questions here, but none of them helped me understand how to fetch the images that keep disappearing from the page source as I scroll.
Hope my question is clear. Thanks in advance.
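For what it's worth, this recycling behaviour can be verified with a small standalone check (just a sketch; it assumes driver is the Selenium WebDriver instance, already logged in and sitting on the profile page):
# Sketch: confirm that Instagram removes earlier <img> nodes from the DOM while scrolling.
from time import sleep

before = len(driver.find_elements_by_tag_name('img'))  # img tags currently in the DOM
driver.execute_script('window.scrollTo(0, document.body.scrollHeight);')
sleep(3)
after = len(driver.find_elements_by_tag_name('img'))   # count again after one scroll
print(before, after)  # if both counts stay small, earlier rows are being dropped from the DOM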
Edit: I tried scraping on every scroll and it seems to work. Here is my new code.
def downloading_images(self):
    print(len(self.all_images))
    for index, image in enumerate(self.all_images):
        filename = "image_" + str(index) + ".jpg"
        image_path = os.path.join(self.path, filename)
        link = image['src']
        print("Downloading image ", index)
        response = requests.get(link, stream=True)
        try:
            with open(image_path, 'wb') as file:
                shutil.copyfileobj(response.raw, file)
        except Exception as e:
            print(e)
            print('Could not download image no.', index)
            print('Image link', link)
def scroll_down(self):
    sleep(3)
    try:
        num_posts = self.driver.find_element_by_xpath('//span[text()[contains(.," posts")]]/span[@class="g47SY "]')
        str_num_posts = str(num_posts.text).replace(',', '')
        self.int_num_posts = int(str_num_posts)
        if self.int_num_posts > 12:
            num_scrolls = int(self.int_num_posts/12) + 1
        else:
            num_scrolls = self.int_num_posts
        print(num_scrolls)
        sleep(3)
        try:
            soup = BeautifulSoup(self.driver.page_source, 'html.parser')
            images = soup.findAll('img')
            self.all_images = images
            last_height = self.driver.execute_script("return document.body.scrollHeight")
            for win in range(num_scrolls):
                print(win)
                self.driver.execute_script('window.scrollTo(0,document.body.scrollHeight);')
                sleep(3)
                new_height = self.driver.execute_script("return document.body.scrollHeight")
                if new_height == last_height:
                    break
                soup = BeautifulSoup(self.driver.page_source, 'html.parser')
                images = soup.findAll('img')
                self.all_images.extend(images[-12:])
                last_height = new_height
            print(len(self.all_images))
        except Exception as e:
            self.error = True
            print(e)
    except Exception as e:
        self.error = True
        print(e)
def search_target(self):
    try:
        search_bar = self.driver.find_element_by_xpath('//input[@class="XTCLo x3qfX "]')
        search_bar.send_keys(self.target_username)
        taget_profile_url = self.main_url + '/' + self.target_username + '/'
        self.driver.get(taget_profile_url)
    except Exception as e:
        self.error = True
        print(e)
def close_notify_box(self):
    try:
        sleep(3)
        not_now_button = self.driver.find_element_by_xpath('//button[@class="aOOlW HoLwm "]')
        not_now_button.click()
    except Exception:
        pass
def log_in(self):
    try:
        log_in_button = self.driver.find_element_by_xpath('//p[@class="izU2O"]/a')
        log_in_button.click()
        sleep(3)
        user_name_input = self.driver.find_element_by_xpath('//input[@aria-label="Phone number, username, or email"]')
        user_name_input.send_keys(self.username)
        password_input = self.driver.find_element_by_xpath('//input[@aria-label="Password"]')
        password_input.send_keys(self.password)
        password_input.submit()
    except Exception as e:
        self.error = True
        print(e)
I would like to know whether there are any alternative solutions to this, and whether mine is an efficient approach. Thank you.
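One possible refinement (a sketch only, not taken from the code above): collect the src attributes into a set on every scroll, so the same URL is never stored or downloaded twice no matter how many tiles overlap between scrolls. The collect_image_links name is made up for illustration; self.driver, BeautifulSoup and sleep are assumed to be the same as in the code above.
# Sketch: de-duplicate lazily loaded images by URL while scrolling.
def collect_image_links(self, num_scrolls):
    image_links = set()  # a set drops duplicate URLs automatically
    for _ in range(num_scrolls):
        soup = BeautifulSoup(self.driver.page_source, 'html.parser')
        for img in soup.find_all('img'):
            src = img.get('src')
            if src:
                image_links.add(src)
        self.driver.execute_script('window.scrollTo(0, document.body.scrollHeight);')
        sleep(3)
    return image_links
The download loop can then iterate over the returned set instead of self.all_images, which avoids both the missing-image problem and repeated downloads of the same photo.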

Related

How to stop the Crawler

I am trying to write a crawler that goes to a website and searches for a list of keywords, with a max_Depth of 2. The scraper is supposed to stop once any of the keywords appears on any page, but the problem I am facing right now is that the crawler does not stop when it first sees one of the keywords.
I have already tried an early return, a break, CloseSpider, and even Python exit commands.
My crawler class:
class WebsiteSpider(CrawlSpider):
    name = "webcrawler"
    allowed_domains = ["www.roomtoread.org"]
    start_urls = ["https://" + "www.roomtoread.org"]
    rules = [Rule(LinkExtractor(), follow=True, callback="check_buzzwords")]
    crawl_count = 0
    words_found = 0
    def check_buzzwords(self, response):
        self.__class__.crawl_count += 1
        crawl_count = self.__class__.crawl_count
        wordlist = [
            "sfdc",
            "pardot",
            "Web-to-Lead",
            "salesforce"
        ]
        url = response.url
        contenttype = response.headers.get("content-type", "").decode('utf-8').lower()
        data = response.body.decode('utf-8')
        for word in wordlist:
            substrings = find_all_substrings(data, word)
            for pos in substrings:
                ok = False
                if not ok:
                    if self.__class__.words_found == 0:
                        self.__class__.words_found += 1
                        print(word + "," + url + ";")
                        # STOP!  <- this is where I want the crawler to stop
        return Item()
    def _requests_to_follow(self, response):
        if getattr(response, "encoding", None) != None:
            return CrawlSpider._requests_to_follow(self, response)
        else:
            return []
I want it to stop execution when the if not ok: branch is taken.
When I want to stop a spider, I usually raise the exception scrapy.exceptions.CloseSpider(reason='cancelled') from the Scrapy docs.
The example there shows how you can use it:
if 'Bandwidth exceeded' in response.body:
    raise CloseSpider('bandwidth_exceeded')
In your case something like
if not ok:
    raise CloseSpider('keyword_found')
Or is that what you meant by "CloseSpider Commands", and you have already tried it?
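For completeness, a minimal sketch of how the raise sits inside the callback from the question (the class attributes are elided, and a plain substring check stands in for find_all_substrings to keep the example short):
from scrapy.exceptions import CloseSpider

class WebsiteSpider(CrawlSpider):
    # name, allowed_domains, start_urls and rules as in the question ...
    def check_buzzwords(self, response):
        data = response.body.decode('utf-8')
        for word in ["sfdc", "pardot", "Web-to-Lead", "salesforce"]:
            if word in data:
                print(word + "," + response.url + ";")
                # CloseSpider asks the engine to finish in-flight requests and shut down cleanly.
                raise CloseSpider('keyword_found')
        return Item()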

Can't fetch the content of articles using beautifulsoup in python 3.7

I am doing web scraping using BeautifulSoup in Python 3.7. The code below successfully scrapes the date, title, and tags, but not the content of the articles; it gives None instead.
import time
import requests
from bs4 import BeautifulSoup
from bs4.element import Tag

url = 'https://www.thehindu.com/search/?q=cybersecurity&order=DESC&sort=publishdate&ct=text&page={}'
pages = 32

for page in range(4, pages+1):
    res = requests.get(url.format(page))
    soup = BeautifulSoup(res.text, "lxml")
    for item in soup.find_all("a", {"class": "story-card75x1-text"}, href=True):
        _href = item.get("href")
        try:
            resp = requests.get(_href)
        except Exception as e:
            try:
                resp = requests.get("https://www.thehindu.com" + _href)
            except Exception as e:
                continue
        dateTag = soup.find("span", {"class": "dateline"})
        sauce = BeautifulSoup(resp.text, "lxml")
        tag = sauce.find("a", {"class": "section-name"})
        titleTag = sauce.find("h1", {"class": "title"})
        contentTag = sauce.find("div", {"class": "_yeti_done"})
        date = None
        tagName = None
        title = None
        content = None
        if isinstance(dateTag, Tag):
            date = dateTag.get_text().strip()
        if isinstance(tag, Tag):
            tagName = tag.get_text().strip()
        if isinstance(titleTag, Tag):
            title = titleTag.get_text().strip()
        if isinstance(contentTag, Tag):
            content = contentTag.get_text().strip()
        print(f'{date}\n {tagName}\n {title}\n {content}\n')
        time.sleep(3)
I don't see where the problem is, since I am using the correct class for contentTag.
Thanks.
I guess the links you would like to follow from the first page to its inner pages end with .ece. I've applied that logic within the script to traverse those target pages and scrape data from them. I've defined the selectors for the content slightly differently, and now it appears to be working correctly. The following script only scrapes data from page 1; feel free to change it as per your requirements.
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

url = 'https://www.thehindu.com/search/?q=cybersecurity&order=DESC&sort=publishdate&ct=text&page=1'
base = "https://www.thehindu.com"

res = requests.get(url)
soup = BeautifulSoup(res.text, "lxml")
for item in soup.select(".story-card-news a[href$='.ece']"):
    resp = requests.get(urljoin(base, item.get("href")))
    sauce = BeautifulSoup(resp.text, "lxml")
    title = item.get_text(strip=True)
    content = ' '.join([item.get_text(strip=True) for item in sauce.select("[id^='content-body-'] p")])
    print(f'{title}\n {content}\n')
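If you do need more than the first page, one way (a sketch only, reusing the selectors above; search_url is just an illustrative name) is to wrap the same logic in a loop over the page numbers from the original URL:
# Sketch: extend the single-page script above to several result pages.
search_url = 'https://www.thehindu.com/search/?q=cybersecurity&order=DESC&sort=publishdate&ct=text&page={}'
for page in range(1, 5):  # adjust the range to however many pages you need
    res = requests.get(search_url.format(page))
    soup = BeautifulSoup(res.text, "lxml")
    for item in soup.select(".story-card-news a[href$='.ece']"):
        resp = requests.get(urljoin(base, item.get("href")))
        # ...parse resp exactly as in the script above...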

How to collect content from articles in USA Today newspaper using Beautifulsoup in Python 3.7?

I am collecting the date, headline, and content from the USA Today newspaper. I am able to get the date, headline, and even the content, but along with the content I get some unwanted stuff. What should I change in my code to get only the article content?
import time
import requests
from bs4 import BeautifulSoup
from bs4.element import Tag

url = 'https://www.usatoday.com/search/?q=cybersecurity&page={}'
pages = 72

for page in range(1, pages+1):
    res = requests.get(url.format(page))
    soup = BeautifulSoup(res.text, "lxml")
    for item in soup.find_all("a", {"class": "gnt_se_a"}, href=True):
        _href = item.get("href")
        try:
            resp = requests.get(_href)
        except Exception as e:
            try:
                resp = requests.get("https://www.usatoday.com" + _href)
            except Exception as e:
                continue
        sauce = BeautifulSoup(resp.text, "lxml")
        dateTag = sauce.find("span", {"class": "asset-metabar-time asset-metabar-item nobyline"})
        titleTag = sauce.find("h1", {"class": "asset-headline speakable-headline"})
        contentTag = sauce.find("div", {"class": "asset-double-wide double-wide p402_premium"})
        date = None
        title = None
        content = None
        if isinstance(dateTag, Tag):
            date = dateTag.get_text().strip()
        if isinstance(titleTag, Tag):
            title = titleTag.get_text().strip()
        if isinstance(contentTag, Tag):
            content = contentTag.get_text().strip()
        print(f'{date}\n {title}\n {content}\n')
        time.sleep(3)
I am expecting date, headline, and content from each article.
I tried finding the content with
contentTag = sauce.find_all('p', {"class": "p-text"})
and the condition for the content becomes
if isinstance(contentTag, list):
    content = []
    for c in contentTag:
        content.append(c.get_text().strip())
    content = ' '.join(content)
It works.
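An equivalent, slightly more compact variant (just a sketch of the same idea, not from the answer above) joins the paragraph texts in a single expression:
# Sketch: same p.p-text extraction, written as one join over a CSS selector.
content = ' '.join(p.get_text(strip=True) for p in sauce.select("p.p-text"))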

xml.etree.ElementTree.ParseError: not well-formed (invalid token): line 1, column 0

I'm trying to parse a directory with a collection of xml files from RSS feeds.
I have similar code working fine for another directory, so I can't figure out the problem. I want to return the items so I can write them to a CSV file. The error I'm getting is:
xml.etree.ElementTree.ParseError: not well-formed (invalid token): line 1, column 0
Here is the site I've collected RSS feeds from: https://www.ba.no/service/rss
It worked fine for: https://www.nrk.no/toppsaker.rss and https://www.vg.no/rss/feed/?limit=10&format=rss&categories=&keywords=
Here is the function for this RSS:
import os
import xml.etree.ElementTree as ET
import csv

def baitem():
    basepath = "../data_copy/bergens_avisen"
    table = []
    for fname in os.listdir(basepath):
        if fname != "last_feed.xml":
            files = ET.parse(os.path.join(basepath, fname))
            root = files.getroot()
            items = root.find("channel").findall("item")
            #print(items)
            for item in items:
                date = item.find("pubDate").text
                title = item.find("title").text
                description = item.find("description").text
                link = item.find("link").text
                table.append((date, title, description, link))
    return table
I tested with print(items) and it returns all the objects.
Could it be how the XML files are written?
I asked a friend, who suggested testing with a try/except statement. That revealed a .DS_Store file, which only exists on Mac computers. I'm providing the solution for those who might run into the same problem in the future.
def baitem():
    basepath = "../data_copy/bergens_avisen"
    table = []
    for fname in os.listdir(basepath):
        try:
            if fname != "last_feed.xml" and fname != ".DS_Store":
                files = ET.parse(os.path.join(basepath, fname))
                root = files.getroot()
                items = root.find("channel").findall("item")
                for item in items:
                    date = item.find("pubDate").text
                    title = item.find("title").text
                    description = item.find("description").text
                    link = item.find("link").text
                    table.append((date, title, description, link))
        except Exception as e:
            print(fname, e)
    return table
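A slightly more general variant (a sketch, not part of the answer above) is to parse only files that actually end in .xml, so any other stray files are skipped as well:
# Sketch: skip everything that is not an .xml file instead of naming files one by one.
for fname in os.listdir(basepath):
    if not fname.endswith(".xml") or fname == "last_feed.xml":
        continue
    files = ET.parse(os.path.join(basepath, fname))
    # ...process the tree exactly as in the function above...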

Python webdriver switch from main window to popup screen (not Java alert) and login

Here is the window where I need to enter a new password, repeat it, and click 'create'.
My code so far:
createLogin = wait.until(EC.presence_of_element_located((By.XPATH, '//*[@id="Item.MessageUniqueBody"]/div/div/div/div/div[2]/div[2]/a')))
createLogin.click()
time.sleep(10)
try:
    newPassword = self.driver.find_elements_by_xpath('//*[@id="editNewUser_newPassword"]')
    newPassword1 = self.driver.find_elements_by_xpath('//*[@id="editNewUser_newPasswordRepeat"]')
    newPasswordForm = self.driver.find_elements_by_xpath('//*[@id="editNewUserPasswordForm"]/table/tbody/tr[1]/td[1]')
    self.driver.switch_to.active_element(newPasswordForm)
    time.sleep(3)
    newPassword.send_keys('123')
    newPassword1.send_keys('123')
    time.sleep(2)
    # createLog = wait.until(
    #     EC.presence_of_element_located((By.XPATH, '//*[@id="editNewUserPassword_save"]')))
    # createLog.click()
    # time.sleep(5)
except NoAlertPresentException as e:
    time.sleep(2)
    myAccount = wait.until(
        EC.presence_of_element_located((By.XPATH, '//*[@id="easMyAccount1"]')))
    myAccount.click()
    time.sleep(5)
This is the problem: you are using find_elements_by_xpath rather than find_element_by_xpath (plural vs. singular).
find_elements_by_xpath: gives you a list of all web elements matching the locator.
find_element_by_xpath: gives you the first web element matching the locator.
newPassword = self.driver.find_element_by_xpath('//*[@id="editNewUser_newPassword"]')
newPassword1 = self.driver.find_element_by_xpath('//*[@id="editNewUser_newPasswordRepeat"]')
newPasswordForm = self.driver.find_element_by_xpath('//*[@id="editNewUserPasswordForm"]/table/tbody/tr[1]/td[1]')
@gauurang's answer is right, but you have to use find_element_by_xpath. Also, as your XPaths suggest, you have ids available to locate the web elements, and it is always better to use an id over an XPath.
Your XPaths themselves are correct:
newPassword = self.driver.find_element_by_id('editNewUser_newPassword')
newPassword1 = self.driver.find_element_by_id('editNewUser_newPasswordRepeat')
newPasswordForm = self.driver.find_element_by_xpath('//*[@id="editNewUserPasswordForm"]/table/tbody/tr[1]/td[1]')
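Putting both suggestions together, a minimal sketch of the password step (assuming the same wait object, element ids, and test value as in the question):
# Sketch: wait for the fields by id, then type the new password (singular find_element / By.ID).
newPassword = wait.until(EC.element_to_be_clickable((By.ID, 'editNewUser_newPassword')))
newPassword1 = wait.until(EC.element_to_be_clickable((By.ID, 'editNewUser_newPasswordRepeat')))
newPassword.send_keys('123')
newPassword1.send_keys('123')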
