pageToken while scraping YouTube comments in R

I am trying to generate a dataset of comments from a YouTube video and am having trouble looping over the pageToken using the Google API. Below is a snippet of my code. Why doesn't the while loop work?
base_url <- "https://www.googleapis.com/youtube/v3/commentThreads/"
data = "list"
api_opts <- list(
  part = "snippet",
  maxResults = 100,
  textFormat = "plainText",
  videoId = "N708P-A45D0", # This is an example of a video id
  key = "google developer key goes here",
  fields = "items,nextPageToken",
  orderBy = "published")

init_results <- httr::content(httr::GET(base_url, query = api_opts))
data <- init_results$items
api_opts$pageToken <- init_results$nextPageToken
api_opts$pageToken <- gsub("\\=", "", init_results$nextPageToken)
print(api_opts$pageToken)

while (api_opts$pageToken != "") {
  print(api_opts$pageToken)
  next_results <- httr::content(httr::GET(base_url, query = api_opts))
  data <- c(data, next_results$items)
  api_opts$pageToken <- gsub("\\=", "", next_results$nextPageToken)
}

organize_data = function(){
  sub_data <- lapply(data, function(x) {
    data.frame(
      Comment = x$snippet$topLevelComment$snippet$textDisplay,
      User = x$snippet$topLevelComment$snippet$authorDisplayName,
      ReplyCount = x$snippet$totalReplyCount,
      LikeCount = x$snippet$topLevelComment$snippet$likeCount,
      PublishTime = x$snippet$topLevelComment$snippet$publishedAt,
      CommentId = x$snippet$topLevelComment$id,
      stringsAsFactors = FALSE)
  })
}

sample <- organize_data()
L <- length(sample)
sample <- data.frame(matrix(unlist(sample), nrow = L, byrow = TRUE))
colnames(sample) <- c("Comment", "User", "ReplyCount", "LikeCount", "PublishTime", "CommentId")
head(sample)

Thanks for looking. In case anyone else runs into this in the future, below is what I did to fix the problem. I still can't get the replies to the replies.
####
# NEW TRY
# Note: according to YouTube, "YouTube currently supports replies only for top-level comments. However, replies to replies may be supported in the future."
####
rm(list = ls())
data = "list"

# Initialize
init_results <- httr::content(httr::GET("https://www.googleapis.com/youtube/v3/commentThreads?part=snippet%2C+replies&maxResults=100&textFormat=plainText&videoId=N708P-A45D0&fields=items%2CnextPageToken&key=[my google developer key]"))
data <- init_results$items
print(init_results$nextPageToken)

# Begin loop
while (init_results$nextPageToken != "") {
  # URL-encode the page token
  api_opts_pageToken <- gsub("=", "%3D", init_results$nextPageToken)
  # Write the call with the updated page token
  get_call <- gsub("api_pageToken", api_opts_pageToken, "https://www.googleapis.com/youtube/v3/commentThreads?part=snippet%2C+replies&maxResults=100&pageToken=api_pageToken&textFormat=plainText&videoId=N708P-A45D0&fields=items%2CnextPageToken&key=[my google developer key]")
  # Pull out the data from this page token call
  next_results <- httr::content(httr::GET(get_call))
  # Update the data file
  data <- c(data, next_results$items)
  # Update the page token
  print(next_results$nextPageToken)
  init_results$nextPageToken <- next_results$nextPageToken
}
organize_data = function(){
  sub_data <- lapply(data, function(x) {
    data.frame(
      Comment = x$snippet$topLevelComment$snippet$textDisplay,
      User = x$snippet$topLevelComment$snippet$authorDisplayName,
      ReplyCount = x$snippet$totalReplyCount,
      LikeCount = x$snippet$topLevelComment$snippet$likeCount,
      PublishTime = x$snippet$topLevelComment$snippet$publishedAt,
      CommentId = x$snippet$topLevelComment$id,
      stringsAsFactors = FALSE)
  })
}

sample <- organize_data()
L <- length(sample)
sample <- data.frame(matrix(unlist(sample), nrow = L, byrow = TRUE))
colnames(sample) <- c("Comment", "User", "ReplyCount", "LikeCount", "PublishTime", "CommentId")
head(sample)
dim(sample)
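For reference, a somewhat simpler way to page through the results is to pass the raw token back through httr's query list and stop when the response no longer contains a nextPageToken; httr URL-encodes query parameters for you, so the gsub() steps aren't needed. A minimal sketch, reusing base_url, api_opts, and init_results from the first block above (not tested against the live API):
# Sketch: paginate via the query list; stop when nextPageToken is absent (NULL)
data <- init_results$items
page_token <- init_results$nextPageToken
while (!is.null(page_token)) {
  api_opts$pageToken <- page_token
  next_results <- httr::content(httr::GET(base_url, query = api_opts))
  data <- c(data, next_results$items)
  page_token <- next_results$nextPageToken  # NULL on the last page, which ends the loop
}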

Related

subscript out of bounds for list! R

I am learning web scraping and have been facing one hurdle after another. I want to create a data frame containing the first table on this page for all portfolio managers, for the month of August 2022.
So far, I have found a way to scrape a single table properly (I think! Please let me know if I can improve on this).
I haven't been able to bind all the tables into a data frame properly. I also wanted to find out whether there is a way to transform this form-style data into a proper data frame, with the first column of every table as the variable names and the second column as the row values (I know I can do the usual data wrangling, but I wanted to know if some function helps transform this kind of form data into a data frame).
> library(tidyverse)
> library(rvest)
> library(httr)
> url <- "https://www.sebi.gov.in/sebiweb/other/OtherAction.do?doPmr=yes"
> pm_id <- read_html(url) %>%
+ html_elements('select[name="pmrId"].f_control option') %>%
+ html_attr("value")
> pm_id <- pm_id[2:416]
> sebi_pm <- function(x) {
+ resp = POST(url,
+ body = list(
+ pmrId= x,
+ year="2022",
+ m .... [TRUNCATED]
> #s <- lapply(pm_id[i], sebi_pm)
> #v <- sebi_pm(pm_id[1])
> #v
> #do.call() lapply(pm_id[1:5], sebi_pm)
> ha <- do.call("rbind", lapply(pm_id, sebi_ .... [TRUNCATED]
#> Error in .[[1]] : subscript out of bounds
Normally I would be a stickler for a reproducible example, but I think I know what you're getting at here... try this...
# DEPENDENCIES -----------------------------------------------------------------
library(rvest)
library(httr)
library(stringr)
library(data.table)
# UTILITY FUNCTIONS ------------------------------------------------------------
get_pm_ids <- function() {
  url <- "https://www.sebi.gov.in/sebiweb/other/OtherAction.do?doPmr=yes"
  # get list of portfolio manager ids
  pm_ids <- read_html(url) |>
    html_elements('select[name="pmrId"].f_control option') |>
    html_attr('value')
  pm_ids
}

get_monthly_report <- function(pmr_id, report_year, report_month) {
  msg <- sprintf('fetching report for portfolio manager: %s; year = %s; month = %s',
                 str_split(pmr_id, '##', simplify = TRUE)[ , 3] |> str_squish(),
                 report_year,
                 report_month)
  message(msg)
  url <- "https://www.sebi.gov.in/sebiweb/other/OtherAction.do?doPmr=yes"
  params <- list(
    currdate = '',
    loginflag = 0,
    searchValue = '',
    pmrId = pmr_id,
    year = report_year,
    month = report_month,
    loginEmail = '',
    loginPassword = '',
    cap_login = '',
    moduleNo = -1,
    moduleId = '',
    link = '',
    yourName = '',
    friendName = '',
    friendEmail = '',
    mailmessage = '',
    cap_email = ''
  )
  resp <- POST(url, body = params)
  pg <- httr::content(resp)
  tbl <- html_nodes(pg, 'div.portlet:nth-child(3) > div:nth-child(1) > table:nth-child(1)')
  result_df <- data.frame()
  if (length(tbl) == 0) {
    # no records found
    result_df <- data.frame(id = pmr_id,
                            report_year = report_year,
                            report_month = report_month)
  } else {
    tr <- html_nodes(tbl, 'tr')
    cell_captions <- lapply(tr, html_children) |> lapply('[', 1) |> lapply(html_text) |> unlist()
    cell_contents <- lapply(tr, html_children) |> lapply('[', 2) |> lapply(html_text) |> unlist()
    result_df <- data.frame(t(cell_contents))
    colnames(result_df) <- cell_captions
    result_df$id <- pmr_id
    result_df$report_year <- report_year
    result_df$report_month <- report_month
  }
  return(result_df)
}
# MAIN -------------------------------------------------------------------------
## 1. fetch list of portfolio manager ids --------------------------------------
pm_ids <- get_pm_ids()
## 2. filter list of portfolio manager ids -------------------------------------
pm_ids <- pm_ids[ 2:416 ]
## 3. testing: fetch reports for a sample of managers in January 2022 ----------
set.seed(1234)
tmp <- sample(pm_ids, 5)
reports_list <- lapply(tmp, get_monthly_report, 2022, 1)
## 4. combine the results ------------------------------------------------------
reports_df <- rbindlist(reports_list, use.names = TRUE, fill = TRUE) |>
as.data.frame()
## 5. inspect results ----------------------------------------------------------
View(reports_df, 'downloaded reports')
This code could be improved by providing some kind of input validation and more robust error handling. Hope this helps!
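To follow up on the error-handling point, one option (just a sketch, reusing the get_monthly_report() and pm_ids objects defined above) is to wrap each request in tryCatch() so a single failed manager doesn't abort the whole run:
# Sketch: skip over failures instead of stopping the whole download
safe_get_monthly_report <- function(pmr_id, report_year, report_month) {
  tryCatch(
    get_monthly_report(pmr_id, report_year, report_month),
    error = function(e) {
      message(sprintf('failed for %s: %s', pmr_id, conditionMessage(e)))
      # return a stub row so the manager still shows up in the combined table
      data.frame(id = pmr_id, report_year = report_year, report_month = report_month)
    }
  )
}

reports_list <- lapply(pm_ids, safe_get_monthly_report, 2022, 1)
reports_df   <- rbindlist(reports_list, use.names = TRUE, fill = TRUE) |> as.data.frame()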

web-scraping from a website that does not change URL

I am very new to web scraping, and I am having some difficulty scraping this website's content. Basically, I would like to collect the pesticide name and active ingredient, but the URL does not change, and I could not find a way to click the grids. Any help?
library(RSelenium)
library(rvest)
library(tidyverse)
rD <- rsDriver(browser="firefox", port=4547L, verbose=F)
remDr <- rD[["client"]]
remDr$navigate("http://www.cdms.net/Label-Database")
This site calls an API to get the list of manufacturers: http://www.cdms.net/labelssds/Home/ManList?Keys=
On the products page, it also uses another API with the manufacturer ID, for example: http://www.cdms.net/labelssds/Home/ProductList?manId=537
You just need to loop through the Lst array and append the results to a data frame.
For instance, the following code gets all the products for the first 5 manufacturers:
library(httr)

manufacturers <- content(GET("http://www.cdms.net/labelssds/Home/ManList?Keys="), as = "parsed", type = "application/json")

maxManufacturer <- 5
index <- 1
manufacturerCount <- 0
data = list()

for(m in manufacturers$Lst){
  print(m$label)
  productUrl <- modify_url("http://www.cdms.net/labelssds/Home/ProductList",
                           query = list(
                             "manId" = m$value
                           ))
  products <- content(GET(productUrl), as = "parsed", type = "application/json")
  for(p in products$Lst){
    data[[index]] = p
    index <- index + 1
  }
  manufacturerCount <- manufacturerCount + 1
  if (manufacturerCount == maxManufacturer){
    break
  }
  Sys.sleep(0.500) # add delay for scraping
}

df <- do.call(rbind, data)
options(width = 1200)
print(df)
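A note on the last step: do.call(rbind, data) on a list of parsed JSON records produces a matrix whose cells are themselves lists, which can be awkward to work with. If you want a regular data frame, a sketch like the following may be more convenient (it assumes each product record is a named list of scalar, non-NULL fields):
# Sketch: flatten the parsed JSON records into a plain table
library(data.table)
df <- rbindlist(data, use.names = TRUE, fill = TRUE)
print(df)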

Google Search in R [duplicate]

I used the following code:
library(XML)
library(RCurl)
getGoogleURL <- function(search.term, domain = '.co.uk', quotes = TRUE)
{
  search.term <- gsub(' ', '%20', search.term)
  if(quotes) search.term <- paste('%22', search.term, '%22', sep = '')
  getGoogleURL <- paste('http://www.google', domain, '/search?q=',
                        search.term, sep = '')
}

getGoogleLinks <- function(google.url)
{
  doc <- getURL(google.url, httpheader = c("User-Agent" = "R(2.10.0)"))
  html <- htmlTreeParse(doc, useInternalNodes = TRUE, error = function(...){})
  nodes <- getNodeSet(html, "//a[@href][@class='l']")
  return(sapply(nodes, function(x) x <- xmlAttrs(x)[[1]]))
}

search.term <- "cran"
quotes <- "FALSE"
search.url <- getGoogleURL(search.term = search.term, quotes = quotes)
links <- getGoogleLinks(search.url)
I would like to find all the links that resulted from my search and I get the following result:
> links
list()
How can I get the links?
In addition, I would like to get the headlines and summaries of the Google results. How can I get those?
And finally, is there a way to get the links that reside in the ChillingEffects.org results?
If you look at the html variable, you can see that the search result links are all nested in <h3 class="r"> tags.
Try changing your getGoogleLinks function to:
getGoogleLinks <- function(google.url) {
  doc <- getURL(google.url, httpheader = c("User-Agent" = "R (2.10.0)"))
  html <- htmlTreeParse(doc, useInternalNodes = TRUE, error = function(...){})
  nodes <- getNodeSet(html, "//h3[@class='r']//a")
  return(sapply(nodes, function(x) x <- xmlAttrs(x)[["href"]]))
}
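With that change, re-running the last two lines of the original script should give a character vector of hrefs, for example:
search.url <- getGoogleURL(search.term = "cran", quotes = FALSE)
links <- getGoogleLinks(search.url)
head(links)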
I created this function to read in a list of company names and then get the top website result for each. It will get you started; you can then adjust it as needed.
# libraries (URLencode() comes from base R's utils package, so only rvest is needed)
library(rvest)

# load data
d <- read.csv("P:\\needWebsites.csv")
c <- as.character(d$Company.Name)

# Function for getting website.
getWebsite <- function(name)
{
  url = URLencode(paste0("https://www.google.com/search?q=", name))
  page <- read_html(url)
  results <- page %>%
    html_nodes("cite") %>% # Get all nodes of type cite. You can change this to grab other node types.
    html_text()
  result <- results[1]
  return(as.character(result)) # Return results if you want to see them all.
}

# Apply the function to a list of company names.
websites <- data.frame(Website = sapply(c, getWebsite))
The other solutions here don't work for me. Here's my take on @Bryce-Chamberlain's answer, which works for me as of August 2019; it also answers another closed question: company name to URL in R.
# install.packages("rvest")
get_first_google_link <- function(name, root = TRUE) {
  url = URLencode(paste0("https://www.google.com/search?q=", name))
  page <- xml2::read_html(url)
  # extract all links
  nodes <- rvest::html_nodes(page, "a")
  links <- rvest::html_attr(nodes, "href")
  # extract first link of the search results
  link <- links[startsWith(links, "/url?q=")][1]
  # clean it
  link <- sub("^/url\\?q\\=(.*?)\\&sa.*$", "\\1", link)
  # get root if relevant
  if(root) link <- sub("^(https?://.*?/).*$", "\\1", link)
  link
}

companies <- data.frame(company = c("apple acres llc", "abbvie inc", "apple inc"))
companies <- transform(companies, url = sapply(company, get_first_google_link))
companies
#> company url
#> 1 apple acres llc https://www.appleacresllc.com/
#> 2 abbvie inc https://www.abbvie.com/
#> 3 apple inc https://www.apple.com/
Created on 2019-08-10 by the reprex package (v0.2.1)
The free solutions don't work anymore, and they don't allow you to search for regions outside your location. Here's a solution using the Google Custom Search API. The API allows 100 free calls per day, and one call returns only 10 results (i.e., page 1), which is what the function below returns.
library(dplyr) # for the pipe, mutate(), rename(), select()

Google.Search.API <- function(keyword, google.key, google.cx, country = "us")
{
  # keyword = keywords[10]; country = "us"
  url <- paste0("https://www.googleapis.com/customsearch/v1?"
                , "key=", google.key
                , "&q=", gsub(" ", "+", keyword)
                , "&gl=", country # Country
                , "&hl=en"        # Language from Browser, english
                , "&cx=", google.cx
                , "&fields=items(link)"
  )
  d2 <- url %>%
    httr::GET(ssl.verifypeer = TRUE) %>%
    httr::content(.) %>% .[["items"]] %>%
    data.table::rbindlist(.) %>%
    mutate(keyword, SERP = row_number(), search.engine = "Google API") %>%
    rename(source = link) %>%
    select(search.engine, keyword, SERP, source)
  pause <- round(runif(1, min = 1.1, max = 5), 1)
  if(nrow(d2) == 0)
  {cat("\nPausing", pause, "seconds. Failed for:", keyword)} else
  {cat("\nPausing", pause, "seconds. Successful for:", keyword)}
  Sys.sleep(pause)
  rm(keyword, country, pause, url, google.key, google.cx)
  return(d2)
}
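A minimal usage sketch (the key and search-engine ID below are placeholders; replace them with your own Custom Search credentials):
# Hypothetical credentials -- substitute your own API key and custom search engine ID
my_key <- "YOUR_GOOGLE_API_KEY"
my_cx  <- "YOUR_SEARCH_ENGINE_ID"

results <- Google.Search.API("cran r project", google.key = my_key, google.cx = my_cx)
head(results)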

How to authenticate myself in Postman using API Key with Rscript

I'm trying to connect to an API that works in Postman, but I need to authenticate myself using Type = API Key (a header key and value, shown in the code below).
My problem is that I don't know how to add those credentials to my R script to be able to get access. This is my current code:
library(httr)
library(tidyverse)
library(plyr)
# Settings
.proxy <- list(url = "gbiss-l-ss31.int.dir.witowa.com",
               user = "svc-g-gad",
               pwd = "5vcGBgaGaSrf",
               port = 8090,
               Header = list(Key <- 'X-EDS-USER',
                             Value <- 'B6E6685F-DB0C-438A-983F')
)

format_url_data <- function(x){
  raw <- httr::GET(url = x,
                   httr::use_proxy(
                     url = .proxy$url,
                     port = .proxy$port,
                     username = .proxy$user,
                     password = .proxy$pwd
                   )
  )
  raw <- intToUtf8(raw$content)
  jsonlite::fromJSON(raw)
}

############################ FOR FF (Don't change anything) ##################################################################
# Define Url
basehttr <- 'https://iat.eds.gateway-api.willistowerswatson.com/Clients/Search?query='
endhttr <- 'APPLE'
endhttr_backup <- endhttr
endhttr <- URLencode(endhttr)
url <- glue::glue('{basehttr}{endhttr}')

# Get the information and convert to df
dt <- tryCatch( {
  dt <- purrr::map(url, ~format_url_data(.))},
  error = function( error_condition ) {
    basehttr <- "https://qa.eds.gateway-api.willistowerswatson.com/gateway-api/Clients/Search?query="
    endhttr <- 'APPLE'
    endhttr_backup <- endhttr
    endhttr <- URLencode(endhttr)
    url <- glue::glue('{basehttr}{endhttr}')
    dt <- purrr::map(url, ~format_url_data(.))
  }
)

A <- dt %>% as.list.data.frame()
B <- ldply(A, data.frame)
Data <- B %>%
  drop_na(name) %>% as.data.frame()
Data$Name <- gsub("[^[:alnum:][:blank:]?&/\\-]", "", Data$Name)
When I run my code, I get an authentication error.
Could you help me connect with those credentials, the way Postman does?
Thanks
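For what it's worth, in httr an API key of this kind is typically attached as a request header with httr::add_headers(); whether that alone is enough depends on what the API expects. A sketch, assuming the header name and value from the .proxy list above are exactly what Postman sends, with the proxy settings kept as in the original code:
# Sketch: pass the API key as a request header alongside the proxy settings
format_url_data <- function(x){
  raw <- httr::GET(url = x,
                   httr::add_headers(`X-EDS-USER` = 'B6E6685F-DB0C-438A-983F'),
                   httr::use_proxy(
                     url = .proxy$url,
                     port = .proxy$port,
                     username = .proxy$user,
                     password = .proxy$pwd
                   ))
  jsonlite::fromJSON(intToUtf8(raw$content))
}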

Google trends scraping with R by Metro Area

I'm using the following code in R to download data from Google Trends, which I took mostly from here: http://christophriedl.net/2013/08/22/google-trends-with-r/
############################################
## Query GoogleTrends from R
##
## by Christoph Riedl, Northeastern University
## Additional help and bug-fixing re cookies by
## Philippe Massicotte Université du Québec à Trois-Rivières (UQTR)
############################################
# Load required libraries
library(RCurl) # For getURL() and curl handler / cookie / google login
library(stringr) # For str_trim() to trim whitespace from strings
# Google account settings
username <- "USERNAME"
password <- "PASSWORD"
# URLs
loginURL <- "https://accounts.google.com/accounts/ServiceLogin"
authenticateURL <- "https://accounts.google.com/accounts/ServiceLoginAuth"
trendsURL <- "http://www.google.com/trends/TrendsRepport?"
############################################
## This gets the GALX cookie which we need to pass back with the login form
############################################
getGALX <- function(curl) {
  txt = basicTextGatherer()
  curlPerform( url=loginURL, curl=curl, writefunction=txt$update, header=TRUE, ssl.verifypeer=FALSE )
  tmp <- txt$value()
  val <- grep("Cookie: GALX", strsplit(tmp, "\n")[[1]], val = TRUE)
  return( strsplit( val, "[:=;]")[[1]][3] )
}
############################################
## Function to perform Google login and get cookies ready
############################################
gLogin <- function(username, password) {
  ch <- getCurlHandle()
  ans <- (curlSetOpt(curl = ch,
                     ssl.verifypeer = FALSE,
                     useragent = getOption('HTTPUserAgent', "R"),
                     timeout = 60,
                     followlocation = TRUE,
                     cookiejar = "./cookies",
                     cookiefile = ""))
  galx <- getGALX(ch)
  authenticatePage <- postForm(authenticateURL, .params = list(Email = username, Passwd = password, GALX = galx, PersistentCookie = "yes", continue = "http://www.google.com/trends"), curl = ch)
  authenticatePage2 <- getURL("http://www.google.com", curl = ch)
  if(getCurlInfo(ch)$response.code == 200) {
    print("Google login successful!")
  } else {
    print("Google login failed!")
  }
  return(ch)
}
##
# returns string w/o leading or trailing whitespace
trim <- function (x) gsub("^\\s+|\\s+$", "", x)
get_interest_over_time <- function(res, clean.col.names = TRUE) {
  # remove all text before "Interest over time" data block begins
  data <- gsub(".*Interest over time", "", res)
  # remove all text after "Interest over time" data block ends
  data <- gsub("\n\n.*", "", data)
  # convert "interest over time" data block into data.frame
  data.df <- read.table(text = data, sep = ",", header = TRUE)
  # split the date range, keeping only the end-of-week date
  data.df$Week <- gsub(".*\\s-\\s", "", data.df$Week)
  data.df$Week <- as.Date(data.df$Week)
  # clean column names
  if(clean.col.names == TRUE) colnames(data.df) <- gsub("\\.\\..*", "", colnames(data.df))
  # return "interest over time" data.frame
  return(data.df)
}
############################################
## Read data for a query
############################################
ch <- gLogin( username, password )
authenticatePage2 <- getURL("http://www.google.com", curl=ch)
res <- getForm(trendsURL, q="sugar", geo="US", content=1, export=1, graph="all_csv", curl=ch)
# Check if quota limit reached
if( grepl( "You have reached your quota limit", res ) ) {
  stop( "Quota limit reached; you should wait a while and try again later" )
}
df <- get_interest_over_time(res)
head(df)
write.csv(df,"sugar.csv")
When I search just for the US, or any single country, everything works fine, but I need more disaggregated data, at the metropolitan-area level. However, I cannot get those queries to work with this script. Whenever I try, by typing, for example, "US-IL" in the geo field, I get an error:
Error in read.table(text = data, sep = ",", header = TRUE) :
more columns than column names
The same happens if I try to get a trend for a metropolitan area (using something like "US-IL-602" for Chicago, for example). Does anyone know how I could modify this script to make it work?
Thank you very much,
Brian.
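In case it helps, an alternative route (not based on the script above) is the gtrendsR package, which talks to the current Google Trends endpoints and, as far as I know, accepts DMA-level geo codes such as "US-IL-602". A minimal sketch, reusing the "sugar" query from the script:
# Sketch (assumes the gtrendsR package): query a DMA-level region directly
# install.packages("gtrendsR")
library(gtrendsR)
res <- gtrends(keyword = "sugar", geo = "US-IL-602", time = "today 12-m")
head(res$interest_over_time)
write.csv(res$interest_over_time, "sugar_chicago.csv", row.names = FALSE)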
