Google trends scraping with R by Metro Area - r

I'm using the following code in R to download data from Google Trends, that I took mostly from here http://christophriedl.net/2013/08/22/google-trends-with-r/
############################################
## Query GoogleTrends from R
##
## by Christoph Riedl, Northeastern University
## Additional help and bug-fixing re cookies by
## Philippe Massicotte Université du Québec à Trois-Rivières (UQTR)
############################################
# Load required libraries
library(RCurl) # For getURL() and curl handler / cookie / google login
library(stringr) # For str_trim() to trip whitespace from strings
# Google account settings
username <- "USERNAME"
password <- "PASSWORD"
# URLs
loginURL <- "https://accounts.google.com/accounts/ServiceLogin"
authenticateURL <- "https://accounts.google.com/accounts/ServiceLoginAuth"
trendsURL <- "http://www.google.com/trends/TrendsRepport?"
############################################
## This gets the GALX cookie which we need to pass back with the login form
############################################
getGALX <- function(curl) {
txt = basicTextGatherer()
curlPerform( url=loginURL, curl=curl, writefunction=txt$update, header=TRUE, ssl.verifypeer=FALSE )
tmp <- txt$value()
val <- grep("Cookie: GALX", strsplit(tmp, "\n")[[1]], val = TRUE)
strsplit(val, "[:=;]")[[1]][3]
return( strsplit( val, "[:=;]")[[1]][3])
}
############################################
## Function to perform Google login and get cookies ready
############################################
gLogin <- function(username, password) {
ch <- getCurlHandle()
ans <- (curlSetOpt(curl = ch,
ssl.verifypeer = FALSE,
useragent = getOption('HTTPUserAgent', "R"),
timeout = 60,
followlocation = TRUE,
cookiejar = "./cookies",
cookiefile = ""))
galx <- getGALX(ch)
authenticatePage <- postForm(authenticateURL, .params=list(Email=username, Passwd=password, GALX=galx, PersistentCookie="yes", continue="http://www.google.com/trends"), curl=ch)
authenticatePage2 <- getURL("http://www.google.com", curl=ch)
if(getCurlInfo(ch)$response.code == 200) {
print("Google login successful!")
} else {
print("Google login failed!")
}
return(ch)
}
##
# returns string w/o leading or trailing whitespace
trim <- function (x) gsub("^\\s+|\\s+$", "", x)
get_interest_over_time <- function(res, clean.col.names = TRUE) {
# remove all text before "Interest over time" data block begins
data <- gsub(".*Interest over time", "", res)
# remove all text after "Interest over time" data block ends
data <- gsub("\n\n.*", "", data)
# convert "interest over time" data block into data.frame
data.df <- read.table(text = data, sep =",", header=TRUE)
# Split data range into to only end of week date
data.df$Week <- gsub(".*\\s-\\s", "", data.df$Week)
data.df$Week <- as.Date(data.df$Week)
# clean column names
if(clean.col.names == TRUE) colnames(data.df) <- gsub("\\.\\..*", "", colnames(data.df))
# return "interest over time" data.frame
return(data.df)
}
############################################
## Read data for a query
############################################
ch <- gLogin( username, password )
authenticatePage2 <- getURL("http://www.google.com", curl=ch)
res <- getForm(trendsURL, q="sugar", geo="US", content=1, export=1, graph="all_csv", curl=ch)
# Check if quota limit reached
if( grepl( "You have reached your quota limit", res ) ) {
stop( "Quota limit reached; You should wait a while and try again lateer" )
}
df <- get_interest_over_time(res)
head(df)
write.csv(df,"sugar.csv")
When I search just for the US, or any single country, everything works fine, but I need more disagreggated data, at Metropolitan Area. However, I cannot get those queries to work with this script. Whenever I do it, by typing, for example "US-IL" in the geo field, I get an error:
Error in read.table(text = data, sep = ",", header = TRUE) :
more columns than column names
The same happens if I try to do take a trend for a Metropolitan Area (using something like "US-IL-602" for Chicago, for example). Does anyone know how could I modify this script to make it work?
Thank you very much,
Brian.

Related

How to fix subscript out of bounds in R from subsetting a list?

I am fairly new with R. I decided for my own learning process to scrape the tracks that my favorite radio station is playing and then add these songs to my spotify playlist. This way I can listen to the music of my favorite radio station without any advertising
What is going well?
I can scrape the songs and add a test song to my spotify playlist.
Where does it go wrong?
Via the spotify API I retrieve all information about the songs based on the artist and title. I only need the spotify:track:xxxxx part of every response I get back. When I try to extract the part spotify: track: (track uri) from every response I get the error: subscript out of bounds:
### Radio2 playlist scraper ###
#Loading packages#
install.packages("rvest")
library(rvest)
install.packages("dplyr")
library("dplyr")
install.packages("remotes")
remotes::install_github("charlie86/spotifyr")
library(spotifyr)
install.packages('knitr', dependencies = TRUE)
library(knitr)
install.packages("stringr")
library("stringr")
install.packages("jsonlite")
library("jsonlite")
library(jsonlite)
library(purrr)
library(data.table)
library(httr)
library(magrittr)
library(rvest)
library(ggplot2)
#Get playlist url #
url <- "https://www.nporadio2.nl/playlist"
#Read HTML code from pagen#
webpage <- read_html(url)
#Get Artist and Title#
artist <- html_nodes(webpage, '.fn-artist')
title <- html_nodes(webpage, '.fn-song')
#Artist and Title to text#
artist_text <- html_text(artist)
title_text <- html_text(title)
#Artist and Title to dataframe#
artiest <- as.data.frame(artist_text)
titel_text <- as.data.frame(title_text)
#Make one dataframe#
radioplaylist <- cbind(artiest$artist_text, titel_text$title_text)
radioplaylist <- as.data.frame(radioplaylist)
radioplaylist
#Rename columns#
colnames(radioplaylist)[1] <- "Artiest"
colnames(radioplaylist)[2] <- "Titel"
radioplaylist
#Remove duplicate songs#
radioplaylistuniek <- radioplaylist %>% distinct(Artiest, Titel, .keep_all = TRUE)
#Write to csv#
date <- Sys.Date()
date
write.csv(radioplaylistuniek, paste0("C://Users//Kantoor//Radio2playlists//playlist - ", date, ".csv"))
#Set spotify API#
Sys.setenv(SPOTIFY_CLIENT_ID = 'xxxxxxxxxxxxx')
Sys.setenv(SPOTIFY_CLIENT_SECRET = 'xxxxxxxxxxxx')
access_token <- get_spotify_access_token()
# Client and secret#
clientID <- "xxxxxxxxxxxxxxx"
secret <- "xxxxxxxxxxxxxx"
# Get access token and write this to authorization header #
response = POST(
'https://accounts.spotify.com/api/token',
accept_json(),
authenticate(clientID, secret),
body = list(grant_type = 'client_credentials'),
encode = 'form',
verbose()
)
token = content(response)$access_token
authorization.header = paste0("Bearer ", token)
# Generate URLS #
radioplaylistuniektest <- radioplaylistuniek[1:100,]
urls <- list(c("https://api.spotify.com/v1/search?q=track:")) %>% paste0(radioplaylistuniektest$Titel) %>% paste0(c("%20artist:")) %>% paste0(radioplaylistuniektest$Artiest) %>% paste(c("&type=track&limit=1"), sep = "")
# Get track information#
lijstwijk <- lapply(urls, GET, simplifyMatrix=TRUE, flatten=TRUE, config = add_headers(authorization = authorization.header))
# Get trackuri from each response#
lijstwijkuri <- lapply(lijstwijk, function(item) content(item, as="parsed")$tracks$items[[1]]$uri)
Error in content(item, as = "parsed")$tracks$items[[1]] :
subscript out of bounds
When I remove the track URI from the response for a few songs, lets say for the first 5, everything goes well:
# Generate URLS #
radioplaylistuniektest <- radioplaylistuniek[1:5,]
urls <- list(c("https://api.spotify.com/v1/search?q=track:")) %>% paste0(radioplaylistuniektest$Titel) %>% paste0(c("%20artist:")) %>% paste0(radioplaylistuniektest$Artiest) %>% paste(c("&type=track&limit=1"), sep = "")
# Get track information#
lijstwijk <- lapply(urls, GET, simplifyMatrix=TRUE, flatten=TRUE, config = add_headers(authorization = authorization.header))
# Get trackuri from each response#
lijstwijkuri <- lapply(lijstwijk, function(item) content(item, as="parsed")$tracks$items[[1]]$uri)
lijstwijkuri
[[1]]
[1] "spotify:track:5Xhqe9xu6bKRSqLj1mS1SB"
[[2]]
[1] "spotify:track:21YxK0klhpfLW8budkJaMF"
[[3]]
[1] "spotify:track:468OIV1LzYrm3rluVKl8AU"
[[4]]
[1] "spotify:track:3yDhZq8f17SmumVmEyCaRN"
[[5]]
[1] "spotify:track:0IseLavjQ32B5wxYxWeuw5"
How to fix the subscript out of bounds error?
What is going wrong? How can i fix the subscript out of bounds error for extracting the spotify:track:xxxx part from each response?
Got the solution. So for anyone who is curious. This is how i fixed it:
# Unlist results #
responses <- unlist(lapply(lijstwijk, paste, collapse=" "))
# Results to dataframe #
responsesdf <- as.data.frame(responses)
# Get spotify:track string#
uriperurl <- data.frame(uri = str_extract(responsesdf$responses, "(spotify:track:)\\w+"))

Google Search in R [duplicate]

I used the following code:
library(XML)
library(RCurl)
getGoogleURL <- function(search.term, domain = '.co.uk', quotes=TRUE)
{
search.term <- gsub(' ', '%20', search.term)
if(quotes) search.term <- paste('%22', search.term, '%22', sep='')
getGoogleURL <- paste('http://www.google', domain, '/search?q=',
search.term, sep='')
}
getGoogleLinks <- function(google.url)
{
doc <- getURL(google.url, httpheader = c("User-Agent" = "R(2.10.0)"))
html <- htmlTreeParse(doc, useInternalNodes = TRUE, error=function(...){})
nodes <- getNodeSet(html, "//a[#href][#class='l']")
return(sapply(nodes, function(x) x <- xmlAttrs(x)[[1]]))
}
search.term <- "cran"
quotes <- "FALSE"
search.url <- getGoogleURL(search.term=search.term, quotes=quotes)
links <- getGoogleLinks(search.url)
I would like to find all the links that resulted from my search and I get the following result:
> links
list()
How can I get the links?
In addition I would like to get the headlines and summary of google results how can I get it?
And finally is there a way to get the links that resides in ChillingEffects.org results?
If you look at the htmlvariable, you can see that the search result links all are nested in <h3 class="r"> tags.
Try to change your getGoogleLinks function to:
getGoogleLinks <- function(google.url) {
doc <- getURL(google.url, httpheader = c("User-Agent" = "R
(2.10.0)"))
html <- htmlTreeParse(doc, useInternalNodes = TRUE, error=function
(...){})
nodes <- getNodeSet(html, "//h3[#class='r']//a")
return(sapply(nodes, function(x) x <- xmlAttrs(x)[["href"]]))
}
I created this function to read in a list of company names and then get the top website result for each. It will get you started then you can adjust it as needed.
#libraries.
library(URLencode)
library(rvest)
#load data
d <-read.csv("P:\\needWebsites.csv")
c <- as.character(d$Company.Name)
# Function for getting website.
getWebsite <- function(name)
{
url = URLencode(paste0("https://www.google.com/search?q=",name))
page <- read_html(url)
results <- page %>%
html_nodes("cite") %>% # Get all notes of type cite. You can change this to grab other node types.
html_text()
result <- results[1]
return(as.character(result)) # Return results if you want to see them all.
}
# Apply the function to a list of company names.
websites <- data.frame(Website = sapply(c,getWebsite))]
other solutions here don't work for me, here's my take on #Bryce-Chamberlain's issue which works for me in August 2019, it answers also another closed question : company name to URL in R
# install.packages("rvest")
get_first_google_link <- function(name, root = TRUE) {
url = URLencode(paste0("https://www.google.com/search?q=",name))
page <- xml2::read_html(url)
# extract all links
nodes <- rvest::html_nodes(page, "a")
links <- rvest::html_attr(nodes,"href")
# extract first link of the search results
link <- links[startsWith(links, "/url?q=")][1]
# clean it
link <- sub("^/url\\?q\\=(.*?)\\&sa.*$","\\1", link)
# get root if relevant
if(root) link <- sub("^(https?://.*?/).*$", "\\1", link)
link
}
companies <- data.frame(company = c("apple acres llc","abbvie inc","apple inc"))
companies <- transform(companies, url = sapply(company,get_first_google_link))
companies
#> company url
#> 1 apple acres llc https://www.appleacresllc.com/
#> 2 abbvie inc https://www.abbvie.com/
#> 3 apple inc https://www.apple.com/
Created on 2019-08-10 by the reprex package (v0.2.1)
The free solutions don't work anymore. Plus it doesn't allow you to search for regions outside your location. Here's a solution using Google Custom Search API. The API allows 100 free API calls per day. The function below returns only 10 results or page 1. 1 API call returns only 10 results.
Google.Search.API <- function(keyword, google.key, google.cx, country = "us")
{
# keyword = keywords[10]; country = "us"
url <- paste0("https://www.googleapis.com/customsearch/v1?"
, "key=", google.key
, "&q=", gsub(" ", "+", keyword)
, "&gl=", country # Country
, "&hl=en" # Language from Browser, english
, "&cx=", google.cx
, "&fields=items(link)"
)
d2 <- url %>%
httr::GET(ssl.verifypeer=TRUE) %>%
httr::content(.) %>% .[["items"]] %>%
data.table::rbindlist(.) %>%
mutate(keyword, SERP = row_number(), search.engine = "Google API") %>%
rename(source = link) %>%
select(search.engine, keyword, SERP, source)
pause <- round(runif(1, min = 1.1, max = 5), 1)
if(nrow(d2) == 0)
{cat("\nPausing", pause, "seconds. Failed for:", keyword)} else
{cat("\nPausing", pause, "seconds. Successful for:", keyword)}
Sys.sleep(pause)
rm(keyword, country, pause, url, google.key, google.cx)
return(d2)
}

How to authenticate myself in Postman using API Key with Rscript

I'm trying to create an API connection with Postman, but i need to authenticate myself using Type = API Key. these are my credentials
My problem is because i don't know hot to add that cretentials to my Rscript to be able to access, this is my current code
library(httr)
library(tidyverse)
library(plyr)
# Settings
.proxy <- list(url = "gbiss-l-ss31.int.dir.witowa.com",
user = "svc-g-gad",
pwd = "5vcGBgaGaSrf",
port = 8090,
Header = list(Key <- 'X-EDS-USER',
Value <- 'B6E6685F-DB0C-438A-983F')
)
format_url_data <- function(x){
raw <- httr::GET(url = x,
httr::use_proxy(
url = .proxy$url,
port = .proxy$port,
username = .proxy$user,
password = .proxy$pwd
)
)
raw <- intToUtf8(raw$content)
jsonlite::fromJSON(raw)
}
############################ FOR FF (Don't change anything) ##################################################################
#Define Url
# Define Url
basehttr <- 'https://iat.eds.gateway-api.willistowerswatson.com/Clients/Search?query='
endhttr <- 'APPLE'
endhttr_backup <- endhttr
endhttr <- URLencode(endhttr)
url <- glue::glue('{basehttr}{endhttr}')
# Get the information and convert to df
dt <- tryCatch( {
dt <- purrr::map(url, ~format_url_data(.))},
error = function( error_condition ) {
basehttr <- "https://qa.eds.gateway-api.willistowerswatson.com/gateway-api/Clients/Search?query="
endhttr <- 'APPLE'
endhttr_backup <- endhttr
endhttr <- URLencode(endhttr)
url <- glue::glue('{basehttr}{endhttr}')
dt <- purrr::map(url, ~format_url_data(.))
}
)
A <- dt %>% as.list.data.frame()
B <- ldply (A, data.frame)
Data <- B %>%
drop_na(name) %>% as.data.frame()
Data$Name <- gsub("[^[:alnum:][:blank:]?&/\\-]", "", Data$Name)
When i run my code get an Authentication error
Could you help me to connect with Postman with that credentials?
Thanks

API Query for loop

I'm trying to pull some data from an API throw it all into a single data frame. I'm trying to put a variable into the URL I'm pulling from and then loop it to pull data from 54 keys. Here's what I have so far with notes.
library("jsonlite")
library("httr")
library("lubridate")
options(stringsAsFactors = FALSE)
url <- "http://api.kuroganehammer.com"
### This gets me a list of 58 observations, I want to use this list to
### pull data for each using an API
raw.characters <- GET(url = url, path = "api/characters")
## Convert the results from unicode to a JSON
text.raw.characters <- rawToChar(raw.characters$content)
## Convert the JSON into an R object. Check the class of the object after
## it's retrieved and reformat appropriately
characters <- fromJSON(text.raw.characters)
class(characters)
## This pulls data for an individual character. I want to get one of
## these for all 58 characters by looping this and replacing the 1 in the
## URL path for every number through 58.
raw.bayonetta <- GET(url = url, path = "api/characters/1/detailedmoves")
text.raw.bayonetta <- rawToChar(raw.bayonetta$content)
bayonetta <- fromJSON(text.raw.bayonetta)
## This is the function I tried to create, but I get a lexical error when
## I call it, and I have no idea how to loop it.
move.pull <- function(x) {
char.x <- x
raw.x <- GET(url = url, path = cat("api/characters/",char.x,"/detailedmoves", sep = ""))
text.raw.x <- rawToChar(raw.x$content)
char.moves.x <- fromJSON(text.raw.x)
char.moves.x$id <- x
return(char.moves.x)
}
The first part of this:
library(jsonlite)
library(httr)
library(lubridate)
library(tidyverse)
base_url <- "http://api.kuroganehammer.com"
res <- GET(url = base_url, path = "api/characters")
content(res, as="text", encoding="UTF-8") %>%
fromJSON(flatten=TRUE) %>%
as_tibble() -> chars
Gets you a data frame of the characters.
This:
pb <- progress_estimated(length(chars$id))
map_df(chars$id, ~{
pb$tick()$print()
Sys.sleep(sample(seq(0.5, 2.5, 0.5), 1)) # be kind to the free API
res <- GET(url = base_url, path = sprintf("api/characters/%s/detailedmoves", .x))
content(res, as="text", encoding="UTF-8") %>%
fromJSON(flatten=TRUE) %>%
as_tibble()
}, .id = "id") -> moves
Gets you a data frame of all the "moves" and adds the "id" for the character. You get a progress bar for free, too.
You can then either left_join() as needed or group & nest the moves data into a separate list-nest column. If you want that to begin with you can use map() vs map_df().
Leave in the time pause code. It's a free API and you should likely increase the pause times to avoid DoS'ing their site.

pageToken while scraping Youtube comments in R

I am trying to generate a dataset of comments from a Youtube video and am having trouble looping over the pageToken using the Google API. Below is a snippet of code. Why doesn't the 'while' loop work?
base_url <- "https://www.googleapis.com/youtube/v3/commentThreads/"
data = "list"
api_opts <- list(
part = "snippet",
maxResults = 100,
textFormat = "plainText",
videoId = "N708P-A45D0", # This is an example of a video id
key = "google developer key goes here",
fields = "items,nextPageToken",
orderBy = "published")
init_results <- httr::content(httr::GET(base_url, query = api_opts))
data <- init_results$items
api_opts$pageToken <- init_results$nextPageToken
api_opts$pageToken <- gsub("\\=", "", init_results$nextPageToken)
print(api_opts$pageToken)
while (api_opts$pageToken != "") {
print(api_opts$pageToken)
next_results <- httr::content(httr::GET(base_url, query = api_opts))
data <- c(data, next_results$items)
api_opts$pageToken <- gsub("\\=", "", next_results$nextPageToken)
}
organize_data = function(){
sub_data <- lapply(data, function(x) {
data.frame(
Comment = x$snippet$topLevelComment$snippet$textDisplay,
User = x$snippet$topLevelComment$snippet$authorDisplayName,
ReplyCount = x$snippet$totalReplyCount,
LikeCount = x$snippet$topLevelComment$snippet$likeCount,
PublishTime = x$snippet$topLevelComment$snippet$publishedAt,
CommentId = x$snippet$topLevelComment$id,
stringsAsFactors=FALSE)
})
}
sample <- organize_data()
L <- length(sample)
sample <- data.frame(matrix(unlist(sample), nrow=L, byrow=T))
colnames(sample) <- c("Comment", "User", "ReplyCount", "LikeCount", "PublishTime", "CommentId")
head(sample)
Thanks for looking, in case anyone else has this problem in the future, below is what I did to fix this problem. I still can't get the replies to the replies.
####
# NEW TRY
# Note: according to YouTube "YouTube currently supports replies only for top-level comments. However, replies to replies may be supported in the future."
####
rm(list=ls())
data = "list"
# Initialize
init_results <- httr::content(httr::GET("https://www.googleapis.com/youtube/v3/commentThreads?part=snippet%2C+replies&maxResults=100&textFormat=plainText&videoId=N708P-A45D0&fields=items%2CnextPageToken&key=[my google developer key]"))
data <- init_results$items
init_results$nextPageToken
print(init_results$nextPageToken)
# Begin loop
while (init_results$nextPageToken != ""){
# Make the page token URL encoded
api_opts_pageToken <- gsub("=", "%3D", init_results$nextPageToken)
# Write the call with the updated page token
get_call <- gsub("api_pageToken", api_opts_pageToken, "https://www.googleapis.com/youtube/v3/commentThreads?part=snippet%2C+replies&maxResults=100&pageToken=api_pageToken&textFormat=plainText&videoId=N708P-A45D0&fields=items%2CnextPageToken&key==[my google developer key]")
# Pull out the data from this page token call
next_results <- httr::content(httr::GET(get_call))
# Update the datafile
data <- c(data,next_results$items)
# Update the page token
print(next_results$nextPageToken)
init_results$nextPageToken <- next_results$nextPageToken
}
organize_data = function(){
sub_data <- lapply(data, function(x) {
data.frame(
Comment = x$snippet$topLevelComment$snippet$textDisplay,
User = x$snippet$topLevelComment$snippet$authorDisplayName,
ReplyCount = x$snippet$totalReplyCount,
LikeCount = x$snippet$topLevelComment$snippet$likeCount,
PublishTime = x$snippet$topLevelComment$snippet$publishedAt,
CommentId = x$snippet$topLevelComment$id,
stringsAsFactors=FALSE)
})
}
sample <- organize_data()
L <- length(sample)
sample <- data.frame(matrix(unlist(sample), nrow=L, byrow=T))
colnames(sample) <- c("Comment", "User", "ReplyCount", "LikeCount", "PublishTime", "CommentId")
head(sample)
dim(sample)

Resources