How to select with two conditions using XPath - web-scraping

//a[contains(@class,'inprogress')] - selects active matches
//span[contains(@itemprop,'name')] - selects all matches
How do I select only the matches that aren't active? (Active matches are colored red.)
https://www.fudbal91.com/previews/2022-03-30

You can use not(), like this:
//a[not(contains(@class,'inprogress'))]
And if you want to apply both conditions, combine them:
//a[not(contains(@class,'inprogress'))]//span[contains(@itemprop,'name')]
from selenium import webdriver
from selenium.webdriver.common.by import By
#from webdriver_manager.chrome import ChromeDriverManager
from webdriver_manager.firefox import GeckoDriverManager
import time
# Page whose match names are scraped.
url = 'https://www.fudbal91.com/previews/2022-03-30'

# Chrome alternative to the Firefox driver used below:
#driver = webdriver.Chrome(executable_path=ChromeDriverManager().install())
driver = webdriver.Firefox(executable_path=GeckoDriverManager().install())
driver.get(url)
time.sleep(2)  # fixed pause after loading; presumably gives the page time to render

# Select the name <span>s that sit inside links whose class attribute does
# NOT contain "inprogress". XPath addresses attributes with "@".
all_items = driver.find_elements(
    By.XPATH,
    '//a[not(contains(@class,"inprogress"))]//span[contains(@itemprop,"name")]',
)
print('len(all_items):', len(all_items))
for item in all_items:
    print(item.text)

Related

How to get data from app PowerBI as external user

I am interested in downloading data from a national public dataset referring to Vaccine in Italy.
https://app.powerbi.com/view?r=eyJrIjoiMzg4YmI5NDQtZDM5ZC00ZTIyLTgxN2MtOTBkMWM4MTUyYTg0IiwidCI6ImFmZDBhNzVjLTg2NzEtNGNjZS05MDYxLTJjYTBkOTJlNDIyZiIsImMiOjh9
In particular I am interested in downloading the last table.
I tried to use a scraping model via HTML, but it seems that data is not stored directly into the HTML source page.
Then I thought to use the code below in Python 3.9:
import pytest
import time
import json
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support import expected_conditions
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
import time
import pyautogui
import win32api
import win32gui
from win32con import *
# The clipboard is read with pyperclip at the end of every iteration, so it
# has to be imported for the script to run.
import pyperclip

driver = webdriver.Chrome()
# CSS selector prefix for one table cell; the nth-child index and the closing
# parenthesis are appended for each row inside the loop.
LINK = ".tableEx .bodyCells div:nth-child(1) > .pivotTableCellWrap:nth-child("
TO_ADD = "1)"
data = []  # clipboard text captured for each visited cell
driver.get("https://app.powerbi.com/view?r=eyJrIjoiMzg4YmI5NDQtZDM5ZC00ZTIyLTgxN2MtOTBkMWM4MTUyYTg0IiwidCI6ImFmZDBhNzVjLTg2NzEtNGNjZS05MDYxLTJjYTBkOTJlNDIyZiIsImMiOjh9")
driver.set_window_size(784, 835)
driver.execute_script("window.scrollTo(0,0)")
driver.execute_script("window.scrollTo(0,0)")
for i in range(1, 293):
    print(i)
    if i % 10 == 0:
        # Every tenth row: move the OS cursor to a fixed screen position and
        # send a mouse-wheel event, then wait before continuing.
        win32api.SetCursorPos((1625, 724))
        # NOTE(review): x and y are whatever the previous iteration's image
        # search left behind -- confirm these are the intended arguments.
        win32api.mouse_event(MOUSEEVENTF_WHEEL, x, y, -3, 0)
        time.sleep(6)
    # A fresh ActionChains per row so queued actions never carry over.
    action = ActionChains(driver)
    TO_ADD = str(i) + ")"
    action.move_to_element(driver.find_element(By.CSS_SELECTOR, LINK + TO_ADD)).perform()
    # Right-click the cell and press the down arrow twice in the context menu.
    action.context_click().send_keys(Keys.ARROW_DOWN).send_keys(Keys.ARROW_DOWN).perform()
    time.sleep(0.5)
    # Locate the first reference image on screen and click it.
    x, y = pyautogui.locateCenterOnScreen(r'C:\Users\migli\Documents\Learning\copy.png')
    pyautogui.moveTo(x, y, 0.1)
    pyautogui.click()
    time.sleep(0.5)
    # Locate the second reference image on screen and click it.
    x, y = pyautogui.locateCenterOnScreen(r'C:\Users\migli\Documents\Learning\copiaselez.png')
    pyautogui.moveTo(x, y, 0.1)
    pyautogui.click()
    data.append(pyperclip.paste())
Images for clicking:
copiaselez
copy
This seems to achieve what I am trying to do, but then it gets stuck around the 14th cycle and I don't know why. Maybe I should scroll down the page in some manner; I tried to do that manually while the code was running by inserting a sleep around the 10th cycle, but that raises an error as well.
I also thought about using an API, but one does not seem to exist.
Any idea is accepted.

Web scraping for hidden content

I am trying to scrape the price data from this website: https://fuelkaki.sg/home
However, the data does not appear to be in the HTML code of the page. Upon inspecting, the data seems to be nested in the tag, for instance under Caltex for the retailer name, and similarly under multiple nested tags for the price data, which I am unable to scrape with the following code (there are no results to be found).
Any help would be much appreciated.
import requests
from bs4 import BeautifulSoup
# Address of the page whose HTML is downloaded.
URL = 'https://fuelkaki.sg/home'

# Fetch the page and parse the raw response bytes with the built-in HTML parser.
page = requests.get(URL)
soup = BeautifulSoup(markup=page.content, features='html.parser')

# First <div> carrying the "fuel-name" class (None when no such div exists).
results = soup.find('div', attrs={'class': 'fuel-name'})
The table is behind JS (JavaScript) so BeautifulSoup won't see it.
Here's how I'd do it:
import time
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
# Locator strategy constants for find_element().
from selenium.webdriver.common.by import By

options = Options()
options.headless = False
driver = webdriver.Chrome(options=options)
url = "https://fuelkaki.sg/home"
driver.get(url)
time.sleep(3)  # fixed pause after loading; presumably lets the JavaScript-built table appear

# Any element whose class attribute is exactly "table" (XPath attributes use "@").
element = driver.find_element(By.XPATH, '//*[@class="table"]')
print(element.text)
driver.close()
Output:
Diesel
92
95
98
Others
(V-Power, etc)
Caltex
30 September 2020, 02:05pm
S$ 1.73
S$ 2.02
S$ 2.06
N.A.
S$ 2.61
and so on...
EDIT:
If you want the table in a Dataframe try this:
import time
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
# read_html() expects a path, URL or file-like object; wrapping the markup in
# StringIO avoids passing literal HTML as a plain string.
from io import StringIO

options = Options()
options.headless = False
driver = webdriver.Chrome(options=options)
url = "https://fuelkaki.sg/home"
driver.get(url)
time.sleep(3)  # fixed pause after loading; presumably lets the JavaScript-built table appear

# Take the rendered page, keep only the first element matching ".table",
# and let pandas parse the tables inside it.
html = driver.page_source
soup = BeautifulSoup(html, "html.parser").select_one(".table")
df = pd.read_html(StringIO(str(soup)))
# read_html() returns a list of DataFrames; merge them and blank out the
# header pandas auto-generated for the unnamed first column.
df = pd.concat(df).rename(columns={"Unnamed: 0": ""})
df.to_csv("fuel_data.csv", index=False)
driver.close()
Outputs a .csv file with the table's data:

For Loop Not Repeating

I'm trying to find the hrefs for all the states where this company has stores, however it only finds the href for the first state.
Can anyone figure out why the for loop doesn't repeat for the rest of the states? Thank you very much for your help!
import requests
from bs4 import BeautifulSoup
import csv
# website
sitemap = 'website_url'
# content of website
sitemap_content = requests.get(sitemap).content
# parsing website
soup = BeautifulSoup(sitemap_content, 'html.parser')
#print(soup)
list_of_divs = soup.findAll('div', attrs={'class': 'listings-inner'})
#print(list_of_divs)
header = ['Links']
# newline='' is what the csv module asks for on files handed to csv.writer.
with open('/Users/ABC/Desktop/v1.csv', 'wt', newline='') as csvfile:
    writer = csv.writer(csvfile, delimiter="\t")
    writer.writerow(header)
    for state in list_of_divs:
        # get the url's by state
        # find() yields only the first "itemlist" div inside each
        # "listings-inner" div, so one link is written per outer div.
        href = state.find('div', attrs={'class': 'itemlist'}).a.get('href')
        print(href)
        writer.writerow([href])
list_of_divs actually contains only one element, which is the only div on the page with class listings-inner. So when you iterate through all of its elements and use the find method, it'll only return the first result.
You want to use the find_all method on that div:
import requests
from bs4 import BeautifulSoup
# Store locator landing page that lists one link per state.
sitemap = 'https://stores.dollargeneral.com/'
sitemap_content = requests.get(sitemap).content
soup = BeautifulSoup(sitemap_content, 'html.parser')

# The single container div that holds every state entry.
listings_div = soup.find('div', attrs={'class': 'listings-inner'})

# find_all() returns every "itemlist" div inside the container, so the loop
# runs once per entry rather than once per container.
for state in listings_div.find_all('div', attrs={'class': 'itemlist'}):
    print(state.a.get('href'))

Unable to get response in css selector or xpath expression

I am trying to get the movie names from https://www.sunnxt.com/movie/inside/ website. When I go to inspect element it shows me elements with movie names, but when I perform css selector or xpath expression, it did not give the movie name.
When I click on view source, I see that the code is different there, and all the movie data is placed inside a <script></script> tag.
Please help me to get all movie name.
To retrieve the movie names, you have to use WebDriverWait in conjunction with the expected_conditions method visibility_of_all_elements_located, as follows:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
driver = webdriver.Firefox(executable_path=r'C:\Utility\BrowserDrivers\geckodriver.exe')
try:
    driver.get("https://www.sunnxt.com/movie/inside/")
    # Wait up to 10 seconds until every matching title element is visible.
    # XPath attribute tests use "@" (e.g. @class).
    movieList = WebDriverWait(driver, 10).until(
        EC.visibility_of_all_elements_located(
            (By.XPATH, "//h2[@class='title']//following::div[@class='home_movie_list_wrap']//a//h2")
        )
    )
    for item in movieList:
        print(item.text)
finally:
    # Shut the browser down even when the wait above times out.
    driver.quit()
Console Output :
Gulaebaghavali
KALAKALAPPU 2
Motta Shiva Ketta Shiva
Annadurai
Aramm
Kaththi Sandai
Meesaya Murukku
Spyder
Sathriyan
Bogan
Brindavanam
Vivegam
Bairavaa
Karuppan
Muthina Kathirika
Dharmadurai
Thozha
Pichaikkaran
Devi
Aranmanai 2
Jackson Durai
Hello Naan Pei Pesuren
Kathakali
Kodi

How To Remove White Space in Scrapy Spider Data

I am writing my first spider in Scrapy and attempting to follow the documentation. I have implemented ItemLoaders. The spider extracts the data, but the data contains many line returns. I have tried many ways to remove them, but nothing seems to work. The replace_escape_chars utility is supposed to work, but I can't figure out how to use it with the ItemLoader. Also some people use (unicode.strip), but again, I can't seem to get it to work. Some people try to use these in items.py and others in the spider. How can I clean the data of these line returns (\r\n)? My items.py file only contains the item names and field(). The spider code is below:
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from scrapy.contrib.loader import XPathItemLoader
from scrapy.utils.markup import replace_escape_chars
from ccpstore.items import Greenhouse
class GreenhouseSpider(BaseSpider):
    """Spider named "greenhouse" that loads one Greenhouse item per response."""

    name = "greenhouse"
    allowed_domains = ["domain.com"]
    start_urls = [
        "http://www.domain.com",
    ]

    def parse(self, response):
        """Populate a Greenhouse item from fixed XPath locations.

        Returns a one-element list holding the loaded item.
        """
        items = []
        loader = XPathItemLoader(item=Greenhouse(), response=response)
        # XPath attribute tests use "@" (e.g. @class, @id).
        loader.add_xpath('name', '//div[@class="product_name"]')
        loader.add_xpath('title', '//h1')
        loader.add_xpath('usage', '//li[@id="ctl18_ctl00_rptProductAttributes_ctl00_liItem"]')
        loader.add_xpath('repeat', '//li[@id="ctl18_ctl00_rptProductAttributes_ctl02_liItem"]')
        loader.add_xpath('direction', '//li[@id="ctl18_ctl00_rptProductAttributes_ctl03_liItem"]')
        items.append(loader.load_item())
        return items
You can use the default_output_processor on the loader and also other processors on individual fields, see title:
from scrapy.spider import BaseSpider
from scrapy.contrib.loader import XPathItemLoader
from scrapy.contrib.loader.processor import Compose, MapCompose
from w3lib.html import replace_escape_chars, remove_tags
from ccpstore.items import Greenhouse
class GreenhouseSpider(BaseSpider):
    """Spider named "greenhouse" that loads one cleaned Greenhouse item per response."""

    name = "greenhouse"
    allowed_domains = ["domain.com"]
    start_urls = ["http://www.domain.com"]

    def parse(self, response):
        """Populate a Greenhouse item from fixed XPath locations and return it."""
        loader = XPathItemLoader(Greenhouse(), response=response)
        # Default clean-up for every field: strip each value, then run it
        # through replace_escape_chars.
        loader.default_output_processor = MapCompose(lambda v: v.strip(), replace_escape_chars)
        # XPath attribute tests use "@" (e.g. @class, @id).
        loader.add_xpath('name', '//div[@class="product_name"]')
        # The title additionally gets remove_tags applied as a per-field processor.
        loader.add_xpath('title', '//h1', Compose(remove_tags))
        loader.add_xpath('usage', '//li[@id="ctl18_ctl00_rptProductAttributes_ctl00_liItem"]')
        loader.add_xpath('repeat', '//li[@id="ctl18_ctl00_rptProductAttributes_ctl02_liItem"]')
        loader.add_xpath('direction', '//li[@id="ctl18_ctl00_rptProductAttributes_ctl03_liItem"]')
        return loader.load_item()
It turns out that there were also many blank spaces in the data, so combining the answer of Steven with some more research allowed the data to have all tags, line returns and duplicate spaces removed. The working code is below. Note the addition of text() on the loader lines which removes the tags and the split and join processors to remove spaces and line returns.
def parse(self, response):
    """Populate a Greenhouse item from text nodes at fixed XPath locations.

    Each extracted value is split on whitespace and passed through
    replace_escape_chars on input; the pieces are joined back together on
    output. Returns a one-element list holding the loaded item.
    """
    # Join is used as the output processor below; import it here so the
    # method does not depend on an import the snippet never shows.
    from scrapy.contrib.loader.processor import Join

    items = []
    loader = XPathItemLoader(item=Greenhouse(), response=response)
    loader.default_input_processor = MapCompose(lambda v: v.split(), replace_escape_chars)
    loader.default_output_processor = Join()
    # text() selects only the text nodes, so no markup reaches the item.
    # XPath attribute tests use "@" (e.g. @id).
    loader.add_xpath('title', '//h1/text()')
    loader.add_xpath('usage', '//li[@id="ctl18_ctl00_rptProductAttributes_ctl00_liItem"]/text()')
    loader.add_xpath('repeat', '//li[@id="ctl18_ctl00_rptProductAttributes_ctl02_liItem"]/text()')
    loader.add_xpath('direction', '//li[@id="ctl18_ctl00_rptProductAttributes_ctl03_liItem"]/text()')
    items.append(loader.load_item())
    return items

Resources