Related
This is my dataset: when I filter for Actors column, I get a list of list (of 4 actors per movie)
head(movies$Actors)
[[1]]
[1] "Rishab Shetty" " Sapthami Gowda" " Kishore Kumar G."
[4] " Achyuth Kumar"
[[2]]
[1] "Christian Bale" " Heath Ledger" " Aaron Eckhart" " Michael Caine"
[[3]]
[1] "Elijah Wood" " Viggo Mortensen" " Ian McKellen"
[4] " Orlando Bloom"
[[4]]
[1] "Leonardo DiCaprio" " Joseph Gordon-Levitt" " Elliot Page"
[4] " Ken Watanabe"
[[5]]
[1] "Elijah Wood" " Ian McKellen" " Viggo Mortensen"
[4] " Orlando Bloom"
[[6]]
[1] "Elijah Wood" " Ian McKellen" " Orlando Bloom" " Sean Bean"
Since there are 5000 rows, there are way too many actors to use for one hot encoding. What I tried to do is find the top 20 actors (using sort() and table() ),
and then to add a binary variable that states if a particular movie has any of the top e.g.20 actors in it, as this might be a simple proxy for whether the movie has good ratings.
Unfortunately, the code doesn't work. Can't seem to google my way out of this either. Can anyone help me?
## get 20 biggest actors in terms of number of movies
## NOTE(review): actorlist is not defined anywhere in this snippet; presumably
## it is the flattened movies$Actors column -- confirm.
top20actorstable <- sort(table(actorlist), decreasing = T)[1:20]
names(top20actorstable)
## one hot encoding
top20actorsnames <- names(top20actorstable)
## start with NA everywhere; the loop below is meant to overwrite it with 0/1
movies$bigactor <- NA
## NOTE(review): nrow(movies) is a single number, so this loop body runs exactly
## once, with i equal to the last row index -- it does not visit every row.
for (i in nrow(movies)){
## flatten the list-column cell of row i into a plain character vector
listactors <- unlist(movies[i,]$Actors)
## 1 if at least one actor of this row is among the top names, otherwise 0
if (any(is.element(listactors, top20actorsnames))){
movies[i,]$bigactor <- 1
}
else {movies[i,]$bigactor <- 0}
}
Edit:
> dput(head(movies$Actors, 10))
list(c("Rishab Shetty", " Sapthami Gowda", " Kishore Kumar G.",
" Achyuth Kumar"), c("Christian Bale", " Heath Ledger", " Aaron Eckhart",
" Michael Caine"), c("Elijah Wood", " Viggo Mortensen", " Ian McKellen",
" Orlando Bloom"), c("Leonardo DiCaprio", " Joseph Gordon-Levitt",
" Elliot Page", " Ken Watanabe"), c("Elijah Wood", " Ian McKellen",
" Viggo Mortensen", " Orlando Bloom"), c("Elijah Wood", " Ian McKellen",
" Orlando Bloom", " Sean Bean"), c("Keanu Reeves", " Laurence Fishburne",
" Carrie-Anne Moss", " Hugo Weaving"), c("Mark Hamill", " Harrison Ford",
" Carrie Fisher", " Billy Dee Williams"), c("Arnold Schwarzenegger",
" Linda Hamilton", " Edward Furlong", " Robert Patrick"), c("Mark Hamill",
" Harrison Ford", " Carrie Fisher", " Alec Guinness"))
What I meant by "code doesn't work": I was hoping for the for loop to, one by one, check within the list of actors of each row, unlist them and check against the list of top20actors - if there is one of the top actors, then the bigactor column would be a 1, otherwise 0.
However, when I check the column after the for loop, it returns NA:
> for (i in nrow(movies)){
+ listactors <- unlist(movies[i,]$Actors)
+ if (any(is.element(listactors, top20actorsnames))){
+ movies[i,]$bigactor <- 1
+ }
+ else {movies[i,]$bigactor <- 0}
+ }
Warning: provided 11 variables to replace 10 variables
> movies$bigactor
NULL
Here is my approach. Make the list of actors of interest.
Then loop through the list of movies (using sapply()) and find the movies containing (%in%) any of the actors of interest. This returns a vector of TRUE/FALSE values, one per movie, indicating whether there was a match.
# Sample data: one character vector of actors per movie.
movies <- list(c("Rishab Shetty", " Sapthami Gowda", " Kishore Kumar G.", "Achyuth Kumar"),
  c("Christian Bale", " Heath Ledger", " Aaron Eckhart", " Michael Caine"),
  c("Elijah Wood", " Viggo Mortensen", " Ian McKellen", " Orlando Bloom"),
  c("Leonardo DiCaprio", " Joseph Gordon-Levitt", " Elliot Page", " Ken Watanabe"),
  c("Elijah Wood", " Ian McKellen", " Viggo Mortensen", " Orlando Bloom"),
  c("Elijah Wood", " Ian McKellen", " Orlando Bloom", " Sean Bean"),
  c("Keanu Reeves", " Laurence Fishburne", " Carrie-Anne Moss", " Hugo Weaving"),
  c("Mark Hamill", " Harrison Ford", " Carrie Fisher", " Billy Dee Williams"),
  c("Arnold Schwarzenegger", " Linda Hamilton", " Edward Furlong", " Robert Patrick"),
  c("Mark Hamill", " Harrison Ford", " Carrie Fisher", " Alec Guinness"))

# Create the actors list: flatten every movie into one character vector.
# trimws() removes the leading and trailing spaces seen in the sample strings,
# so " Ian McKellen" and "Ian McKellen" are counted as the same actor.
actorlist <- unlist(movies) |> trimws()

# Names of the most frequent actors (shortened down to 7 for debugging).
# Despite the variable name, this holds the actor names, not the table itself.
# head() is used instead of [1:7] so that data with fewer than 7 distinct
# actors does not get padded with NA entries.
top20actorstable <- sort(table(actorlist), decreasing = TRUE) |>
  names() |>
  head(7)

# Loop through the list looking for matching actors.
# Returns one TRUE/FALSE per movie: TRUE when any of its (trimmed) actors is
# among the top names. vapply() is the type-safe form of sapply(): the result
# is always a logical vector, even when `movies` is empty.
bigactor <- vapply(movies, function(movie) {
  any(trimws(movie) %in% top20actorstable)
}, logical(1))
bigactor
# The same flags as 0/1, ready to be used as a binary feature.
as.integer(bigactor)
Since the data sample you provided is a list, I am not sure where the final results should be stored. You could try to store your list of vectors in a data frame, but that is complicated and not very helpful.
I've tried to read the text of this article, however I obtain just character(0)
library(rvest)
# Download and parse the article page.
tex <- read_html("http://semanaeconomica.com/article/sectores-y-empresas/transporte/360660-renegociar-si-anular-no/")
# Narrow down step by step: first every <section> element, then whatever
# inside them matches the CSS selector "#text".
section_nodes <- html_nodes(tex, "section")
text_nodes <- html_nodes(section_nodes, "#text")
# Extract the text of the selected nodes; print() displays it and passes the
# value through, so it is also kept in p_text.
p_text <- print(html_text(text_nodes))
I'm not an expert in web scraping, so I would be very grateful for your help!
I have been able to obtain the text in the page using the following code :
# Read the rendered article text by driving Internet Explorer through COM
# (RDCOMClient) instead of parsing the raw HTML.
library(RDCOMClient)
url <- "http://semanaeconomica.com/article/sectores-y-empresas/transporte/360660-renegociar-si-anular-no/"
# Create an Internet Explorer COM object and make its window visible.
IEApp <- COMCreate("InternetExplorer.Application")
IEApp[['Visible']] <- TRUE
IEApp$Navigate(url)
# Fixed 5-second pause -- presumably to give the page time to load before the
# document is read; a slow connection may need longer (confirm).
Sys.sleep(5)
doc <- IEApp$Document()
# Pick the element addressed by this CSS selector (the selector is tied to the
# page's current markup) and read its inner text.
web_Obj <- doc$querySelector("body > div.container.se-container.se-container--sm.mb60 > div > div.se-article__body.fixed-to-pos.pl55-md.mb60 > div")
txt <- web_Obj$innerText()
# Split the text into lines on newline or carriage return, then drop the
# empty strings left between consecutive line breaks.
txt <- strsplit(txt, "\n|\r")[[1]]
txt <- txt[txt != ""]
# NOTE(review): the browser instance is never closed in this snippet.
txt
[1] "Como una veleta que se mueve según los vientos de la indignación ciudadana, el alcalde Jorge Muñoz anunció que el Concejo Metropolitano evaluará la anulación de los contratos de Rutas de Lim... "
[2] "¿QUIERE LEER LA HISTORIA COMPLETA?"
[3] "Regístrese y obtenga 3 artículos gratis al mes y el boletín informativo."
[4] "Suscríbase para acceso ilimitado"
[5] " "
[6] " DNI "
[7] " Carnet de extranjería "
[8] " "
[9] " "
[10] " "
[11] " "
[12] " "
[13] " "
[14] "Se requiere al menos 8 caracteres, una mayúscula, una minúscula y un número"
[15] " "
[16] " Acepto los términos y condiciones y las políticas de privacidad "
[17] "Regístrese y continúe leyendo "
[18] "¿Ya tiene una cuenta? Inicie sesión "
[19] " grecaptcha.ready(function() { grecaptcha.execute('6LfO_LAZAAAAANQMr4R1KnhUFziP2QJsCQqUCHXR', {action: 'submit'}).then(function(token) { if (token) { document.getElementById('recaptcha').value = token; } }); }); "
I have been looking around for a few hours now and have not been able to remove "" from the character vector of strings below.
c("Final", "A", "7.43", "8.50", "15.93", "2.00",
"1.00", "0.30", "0.37", " 7.43", " 8.50", "0.50", "0.67", " ",
" ", " ", " ", " ", " ", " ", "B", "7.00", "3.77", "10.77",
" 7.00", "1.67", "3.77", " ", " ", " ", " ", " ", " ", " ", " ",
I have many more of these empty values in this dataset and just want to get rid of them before organizing them as a data frame like
Final
A B
7.43 7.43
8.50 8.50
15.93 0.50
2.00 0.67
1.00
0.30
Thanks,
You can use the base grep with value = TRUE. That searches the character vector for a given regex pattern and returns all values where that pattern is found.
You can think about the logic of your pattern a couple ways. One might be to think of it as keeping values with a "word" character, which are letters, numbers, or underscores.
# Sample vector: real values mixed with blank, whitespace-only entries.
x <- c("Final", "A", "7.43", "8.50", "15.93", "2.00", "1.00", "0.30", "0.37", " 7.43", " 8.50", "0.50", "0.67", " ", " ", " ", " ", " ", " ", " ", "B", "7.00", "3.77", "10.77", " 7.00", "1.67", "3.77", " ", " ", " ", " ", " ", " ", " ", " ")
# Keep only the elements containing at least one "word" character (letter,
# digit or underscore). value = TRUE returns the matching strings themselves
# rather than their indices; TRUE is spelled out because T can be reassigned.
grep("\\w", x, value = TRUE)
#> [1] "Final" "A" "7.43" "8.50" "15.93" "2.00" "1.00" "0.30"
#> [9] "0.37" " 7.43" " 8.50" "0.50" "0.67" "B" "7.00" "3.77"
#> [17] "10.77" " 7.00" "1.67" "3.77"
Another way is to find values with a character that isn't a space (\\S is the negation of \\s):
# Keep only the elements containing at least one non-whitespace character;
# value = TRUE returns the strings themselves instead of their positions.
grep("\\S", x, value = TRUE)
#> [1] "Final" "A" "7.43" "8.50" "15.93" "2.00" "1.00" "0.30"
#> [9] "0.37" " 7.43" " 8.50" "0.50" "0.67" "B" "7.00" "3.77"
#> [17] "10.77" " 7.00" "1.67" "3.77"
Created on 2018-12-10 by the reprex package (v0.2.1)
I've tried:
i <- as.numeric(as.character(Impress))
i <- as.numeric(as.character(levels(Impress)))
i <- as.numeric(paste(Impress))
I always get:
Warning message:
NAs introduced by coercion
> i
[1] NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
This is the data I want to be numeric:
> Impress
[1] 24,085,563.00 35,962,587.00 31,714,513.00 28,206,422.00 40,161,010.00 36,292,929.00 31,545,482.00
[8] 28,213,878.00 35,799,224.00 32,400,885.00 28,496,459.00 37,456,344.00 38,108,667.00 33,407,771.00
[15] 32,540,479.00 30,692,707.00 22,873,000.00 21,329,146.00 28,921,953.00 30,471,519.00 28,601,289.00
[22] 27,450,630.00 26,708,790.00 19,825,041.00 18,844,169.00 29,592,039.00 31,012,594.00 28,792,531.00
[29] 28,578,028.00 24,913,985.00
30 Levels: 18,844,169.00 19,825,041.00 21,329,146.00 22,873,000.00 24,085,563.00 24,913,985.00 ... 40,161,010.00
> paste(Impress)
[1] " 24,085,563.00 " " 35,962,587.00 " " 31,714,513.00 " " 28,206,422.00 " " 40,161,010.00 " " 36,292,929.00 " " 31,545,482.00 "
[8] " 28,213,878.00 " " 35,799,224.00 " " 32,400,885.00 " " 28,496,459.00 " " 37,456,344.00 " " 38,108,667.00 " " 33,407,771.00 "
[15] " 32,540,479.00 " " 30,692,707.00 " " 22,873,000.00 " " 21,329,146.00 " " 28,921,953.00 " " 30,471,519.00 " " 28,601,289.00 "
[22] " 27,450,630.00 " " 26,708,790.00 " " 19,825,041.00 " " 18,844,169.00 " " 29,592,039.00 " " 31,012,594.00 " " 28,792,531.00 "
[29] " 28,578,028.00 " " 24,913,985.00 "
and when I do i<-as.numeric(Impress), it pastes the wrong values.
Thanks!
As far as the computer is concerned, , is not a number and hence any number string containing it must not be numeric, even if to a human these look like perfectly acceptable numbers.
Get rid of the , and then it will work, e.g. using gsub()
i <- as.numeric(gsub(",", "", as.character(Impress)))
E.g.
# Small reproducible sample of the comma-formatted values.
Impress <- c("24,085,563.00", "35,962,587.00", "31,714,513.00", "28,206,422.00")
# Strip every comma, leaving plain digit strings.
no_commas <- gsub(",", "", as.character(Impress))
no_commas
# With the commas gone, the strings convert to numbers.
i <- as.numeric(no_commas)
i
R> gsub(",", "", as.character(Impress))
[1] "24085563.00" "35962587.00" "31714513.00" "28206422.00"
R> i
[1] 24085563 35962587 31714513 28206422
R> is.numeric(i)
[1] TRUE
Because the data has commas, R cannot convert it to a numeric. You have to remove all of the commas with gsub() first (sub() would only remove the first one in each string) and then convert:
# Remove every comma, then convert to numeric. The object is `Impress`
# (capital I): R names are case-sensitive.
i <- as.numeric(gsub(",", "", as.character(Impress)))
I've tried:
i <- as.numeric(as.character(Impress))
i <- as.numeric(as.character(levels(Impress)))
i <- as.numeric(paste(Impress))
I always get:
Warning message:
NAs introduced by coercion
> i
[1] NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
This is the data I want to be numeric:
> Impress
[1] 24,085,563.00 35,962,587.00 31,714,513.00 28,206,422.00 40,161,010.00 36,292,929.00 31,545,482.00
[8] 28,213,878.00 35,799,224.00 32,400,885.00 28,496,459.00 37,456,344.00 38,108,667.00 33,407,771.00
[15] 32,540,479.00 30,692,707.00 22,873,000.00 21,329,146.00 28,921,953.00 30,471,519.00 28,601,289.00
[22] 27,450,630.00 26,708,790.00 19,825,041.00 18,844,169.00 29,592,039.00 31,012,594.00 28,792,531.00
[29] 28,578,028.00 24,913,985.00
30 Levels: 18,844,169.00 19,825,041.00 21,329,146.00 22,873,000.00 24,085,563.00 24,913,985.00 ... 40,161,010.00
> paste(Impress)
[1] " 24,085,563.00 " " 35,962,587.00 " " 31,714,513.00 " " 28,206,422.00 " " 40,161,010.00 " " 36,292,929.00 " " 31,545,482.00 "
[8] " 28,213,878.00 " " 35,799,224.00 " " 32,400,885.00 " " 28,496,459.00 " " 37,456,344.00 " " 38,108,667.00 " " 33,407,771.00 "
[15] " 32,540,479.00 " " 30,692,707.00 " " 22,873,000.00 " " 21,329,146.00 " " 28,921,953.00 " " 30,471,519.00 " " 28,601,289.00 "
[22] " 27,450,630.00 " " 26,708,790.00 " " 19,825,041.00 " " 18,844,169.00 " " 29,592,039.00 " " 31,012,594.00 " " 28,792,531.00 "
[29] " 28,578,028.00 " " 24,913,985.00 "
and when I do i<-as.numeric(Impress), it pastes the wrong values.
Thanks!
As far as the computer is concerned, , is not a number and hence any number string containing it must not be numeric, even if to a human these look like perfectly acceptable numbers.
Get rid of the , and then it will work, e.g. using gsub()
i <- as.numeric(gsub(",", "", as.character(Impress)))
E.g.
# Small reproducible sample of the comma-formatted values.
Impress <- c("24,085,563.00", "35,962,587.00", "31,714,513.00", "28,206,422.00")
# Strip every comma, leaving plain digit strings.
no_commas <- gsub(",", "", as.character(Impress))
no_commas
# With the commas gone, the strings convert to numbers.
i <- as.numeric(no_commas)
i
R> gsub(",", "", as.character(Impress))
[1] "24085563.00" "35962587.00" "31714513.00" "28206422.00"
R> i
[1] 24085563 35962587 31714513 28206422
R> is.numeric(i)
[1] TRUE
Because the data has commas, R cannot convert it to a numeric. You have to remove all of the commas with gsub() first (sub() would only remove the first one in each string) and then convert:
# Remove every comma, then convert to numeric. The object is `Impress`
# (capital I): R names are case-sensitive.
i <- as.numeric(gsub(",", "", as.character(Impress)))