How to select variables from dataframe inside other recursive dataframe in r - r

I have the following dataframe:
str(data_raw)
'data.frame': 20 obs. of 18 variables:
$ id : chr "2306173214621953247_176548637" "2304792897512042631_176548637" "2298783867773662543_176548637" "2249480367030200759_176548637" ...
$ user :'data.frame': 20 obs. of 4 variables:
..$ id : chr "176548637" "176548637" "176548637" "176548637" ...
..$ full_name : chr "Carlos Costa" "Carlos Costa" "Carlos Costa" "Carlos Costa" ...
But when I try to get the user$id variable, it shows the error:
data_raw<- data_raw %>%
select(id,user.id)
Error: Can't subset columns that don't exist.
x The column user.id doesn't exist.
I also tried this way:
data_raw<- data_raw %>%
+ select(id,user$id)
Error: object 'user' not found
I know that the user variable is a dataframe, but how do I get information from a dataframe within another dataframe?

I simplified the structure to help. The dataframe has only 1 line.
dput(head(data_raw, 2))
structure(list(id = "2306173214621953247_176548637", user = structure(list(
id = "176548637", full_name = "Carlos Costa", profile_picture = "link.com",
username = "carlosocosta"), row.names = 1L, class = "data.frame"),
images = structure(list(thumbnail = structure(list(width = 150L, height = 150L, url = "link.com"), row.names = 1L, class = "data.frame"),
low_resolution = structure(list(width = 320L, height = 320L,
url = "link.com"), row.names = 1L, class = "data.frame"),
standard_resolution = structure(list(width = 640L, height = 640L,
url = "link.com"), row.names = 1L, class = "data.frame")), row.names = 1L, class = "data.frame"),
created_time = "1589137292", caption = structure(list(id = "18105905566138276",
text = "Não basta ser mãe! Tem que aprender a jogar Fortnite! Feliz dia das mães! #maedemenino",
created_time = "1589137292", from = structure(list(id = "176548637",
full_name = "Carlos Costa", profile_picture = "link.com",
username = "carlosocosta"), row.names = 1L, class = "data.frame")), row.names = 1L, class = "data.frame"),
user_has_liked = FALSE, likes = structure(list(count = 69L), row.names = 1L, class = "data.frame"),
tags = list("maedemenino"), filter = "Normal", comments = structure(list(
count = 3L), row.names = 1L, class = "data.frame"), type = "image",
link = "link.com", location = structure(list(
latitude = NA_real_, longitude = NA_real_, name = NA_character_,
id = NA_integer_), row.names = 1L, class = "data.frame"),
attribution = NA, users_in_photo = list(structure(list(user = structure(list(
username = "vivicosta_oficial"), class = "data.frame", row.names = 1L),
position = structure(list(x = 0.2210144928, y = 0.5857487923), class = "data.frame", row.names = 1L)), class = "data.frame", row.names = 1L)),
carousel_media = list(NULL), videos = structure(list(standard_resolution = structure(list(
width = NA_integer_, height = NA_integer_, url = NA_character_,
id = NA_character_), row.names = 1L, class = "data.frame"),
low_resolution = structure(list(width = NA_integer_,
height = NA_integer_, url = NA_character_, id = NA_character_), row.names = 1L, class = "data.frame"),
low_bandwidth = structure(list(width = NA_integer_, height = NA_integer_,
url = NA_character_, id = NA_character_), row.names = 1L, class = "data.frame")), row.names = 1L, class = "data.frame"),
video_views = NA_integer_), row.names = 1L, class = "data.frame")

Since we have a data frame nested inside a data frame, flatten it into a single data frame using do.call + cbind and then subset the columns needed.
do.call(cbind, data_raw)[c('id', 'user.id')]
# id user.id
#1 2306173214621953247_176548637 176548637
#2 2304792897512042631_176548637 176548637
Or with dplyr::select
library(dplyr)
do.call(cbind, data_raw) %>% select(id, user.id)
data
Tested on this data :
data_raw <- data.frame(id = c('2306173214621953247_176548637',
'2304792897512042631_176548637'))
user <- data.frame(id = c('176548637', '176548637'), full_name = c('a', 'b'))
data_raw$user <- user
str(data_raw)
#'data.frame': 2 obs. of 2 variables:
# $ id : chr "2306173214621953247_176548637" "2304792897512042631_176548637"
# $ user:'data.frame': 2 obs. of 2 variables:
# ..$ id : chr "176548637" "176548637"
# ..$ full_name: chr "a" "b"

Related

R - Convert List with NULL types into dataframe

I have the below list:
test <- list(author = list(name = "Yihui Xie", email = "BLINDED",
date = "2021-04-14T15:26:29Z"), committer = list(name = "Yihui Xie",
email = "xie#yihui.name", date = "2021-04-14T15:27:13Z"),
message = "start the next version", tree = list(sha = "f4032fb23c2e3c6005a76f4e117f6489d321c721",
url = "https://api.github.com/repos/rstudio/blogdown/git/trees/f4032fb23c2e3c6005a76f4e117f6489d321c721"),
url = "https://api.github.com/repos/rstudio/blogdown/git/commits/5aeb809c68cfa1a9e616bc9ed9878c3ea5d05300",
comment_count = 0L, verification = list(verified = FALSE,
reason = "unsigned", signature = NULL, payload = NULL))
I need to convert this into a dataframe, the problem is that I have two elements that are NULL. I need to keep the columns, and switch the content to NA or to an empty string. I am batch processing, so I may have other values null.
A regular change doesn't work:
> as.data.frame(test)
Error in (function (..., row.names = NULL, check.rows = FALSE, check.names = TRUE, :
arguments imply differing number of rows: 1, 0
Expected Output:
The below is another list that I was able to transform properly. As you can see, this one doesn't have missing values, but I am adding it here so you can see what I need (this is a dput of the transformation):
structure(list(author.name = structure(1L, .Label = "Christophe Dervieux", class = "factor"),
author.email = structure(1L, .Label = "BLINDED", class = "factor"),
author.date = structure(1L, .Label = "2021-05-26T16:19:44Z", class = "factor"),
committer.name = structure(1L, .Label = "GitHub", class = "factor"),
committer.email = structure(1L, .Label = "noreply#github.com", class = "factor"),
committer.date = structure(1L, .Label = "2021-05-26T16:19:44Z", class = "factor"),
message = structure(1L, .Label = "clean_duplicates() is now aware of blogdown rendering method (#629)", class = "factor"),
tree.sha = structure(1L, .Label = "f1d056b93ce0d060501d5fd6b9e9df2d934059f6", class = "factor"),
tree.url = structure(1L, .Label = "https://api.github.com/repos/rstudio/blogdown/git/trees/f1d056b93ce0d060501d5fd6b9e9df2d934059f6", class = "factor"),
url = structure(1L, .Label = "https://api.github.com/repos/rstudio/blogdown/git/commits/00a20903f0b2953f8f350d69bffdcd9c50cda5b1", class = "factor"),
comment_count = 0L, verification.verified = TRUE, verification.reason = structure(1L, .Label = "valid", class = "factor"),
verification.signature = structure(1L, .Label = "-----BEGIN PGP SIGNATURE-----\n\nwsBcBAABCAAQBQJgrnUgCRBK7hj4Ov3rIwAAJNMIAD1/pWaW/NYsefSLx5tvcTyl\nfG+Nst5dxAYz1jvZBsiy/zGsrk42EneA391svg6SkW8brf37tNUq3Ob1fXxrknCB\nDctR6X1v281KS9ziFOXMC67HKeqSqWqFD/QaQ3Q2+TDUSdV2Gos6TN6asaBfcwku\nwadow9ZOnzi6tvT7KqWeFD05M8cHnPpTrbPJ8BUjkuf5mQog0xJY40Sev9DFg33P\nux6jhBKJZeN72UxK1K9zs/OvHOLerHoq/pt+mxFnmsf/Kgps2/WX8sE2BLsU6zPg\nePZMyTfLulDXdhoMK6vU6Lj5faiWbLk/xE9zaBKGiRqKALtBsR75YnTal5Gb/qM=\n=bVRa\n-----END PGP SIGNATURE-----\n", class = "factor"),
verification.payload = structure(1L, .Label = "tree f1d056b93ce0d060501d5fd6b9e9df2d934059f6\nparent 20a8258b39f5cbda7911cc8c0cdb35a4bb31aa52\nauthor Christophe Dervieux <cderv#rstudio.com> 1622045984 +0200\ncommitter GitHub <noreply#github.com> 1622045984 +0200\n\nclean_duplicates() is now aware of blogdown rendering method (#629)\n\n", class = "factor")), class = "data.frame", row.names = c(NA,
-1L))
We could recursively replace the NULL values with NA and then convert the flattened list to a data.frame
library(rrapply)
library(dplyr)
out2 <- rrapply(test, f = function(x) replace(x, is.null(x), NA),
how = 'flatten') %>%
as.data.frame.list %>%
type.convert(as.is = TRUE)
checking the dimensions
dim(out2)
#[1] 1 15

Remove empty osmar objects from the list of osmar objects

I have the following list nodes_found, with 2 elements in this example, but the number of elements will vary in the future:
library(osmar)
>nodes_found
$`1`
osmar object
0 nodes, 0 ways, 0 relations
$`2`
osmar object
1 nodes, 0 ways, 0 relations
> class(nodes_found)
[1] "list"
I would like to remove empty osmar objects (objects with 0 nodes; in this case the object at [1]) from the list.
Maybe I could filter based on the attributes of a node? If I request the attributes of an osmar object that contains 0 nodes, it produces a data structure with 0 rows.
So maybe I could use that.
But I don't think it's efficient.
Here is the nodes_found list:
nodes_found<-list(`1` = structure(list(nodes = structure(list(attrs = structure(list(
id = numeric(0), visible = character(0), timestamp = structure(list(
sec = numeric(0), min = integer(0), hour = integer(0),
mday = integer(0), mon = integer(0), year = integer(0),
wday = integer(0), yday = integer(0), isdst = integer(0),
zone = character(0), gmtoff = integer(0)), class = c("POSIXlt",
"POSIXt")), version = numeric(0), changeset = numeric(0),
user = structure(integer(0), .Label = character(0), class = "factor"),
uid = structure(integer(0), .Label = character(0), class = "factor"),
lat = numeric(0), lon = numeric(0)), row.names = integer(0), class = "data.frame"),
tags = structure(list(id = numeric(0), k = structure(integer(0), .Label = character(0), class = "factor"),
v = structure(integer(0), .Label = character(0), class = "factor")), row.names = integer(0), class = "data.frame")), class = c("nodes",
"osmar_element", "list")), ways = structure(list(attrs = structure(list(
id = numeric(0), visible = character(0), timestamp = structure(list(
sec = numeric(0), min = integer(0), hour = integer(0),
mday = integer(0), mon = integer(0), year = integer(0),
wday = integer(0), yday = integer(0), isdst = integer(0),
zone = character(0), gmtoff = integer(0)), class = c("POSIXlt",
"POSIXt")), version = numeric(0), changeset = numeric(0),
user = structure(integer(0), .Label = character(0), class = "factor"),
uid = structure(integer(0), .Label = character(0), class = "factor")), row.names = integer(0), class = "data.frame"),
tags = structure(list(id = numeric(0), k = structure(integer(0), .Label = character(0), class = "factor"),
v = structure(integer(0), .Label = character(0), class = "factor")), row.names = integer(0), class = "data.frame"),
refs = structure(list(id = numeric(0), ref = numeric(0)), row.names = integer(0), class = "data.frame")), class = c("ways",
"osmar_element", "list")), relations = structure(list(attrs = structure(list(
id = numeric(0), visible = character(0), timestamp = structure(list(
sec = numeric(0), min = integer(0), hour = integer(0),
mday = integer(0), mon = integer(0), year = integer(0),
wday = integer(0), yday = integer(0), isdst = integer(0),
zone = character(0), gmtoff = integer(0)), class = c("POSIXlt",
"POSIXt")), version = numeric(0), changeset = numeric(0),
user = structure(integer(0), .Label = character(0), class = "factor"),
uid = structure(integer(0), .Label = character(0), class = "factor")), row.names = integer(0), class = "data.frame"),
tags = structure(list(id = numeric(0), k = structure(integer(0), .Label = character(0), class = "factor"),
v = structure(integer(0), .Label = character(0), class = "factor")), row.names = integer(0), class = "data.frame"),
refs = structure(list(id = numeric(0), type = structure(integer(0), .Label = character(0), class = "factor"),
ref = numeric(0), role = structure(integer(0), .Label = character(0), class = "factor")), row.names = integer(0), class = "data.frame")), class = c("relations",
"osmar_element", "list"))), class = c("osmar", "list")), `2` = structure(list(
nodes = structure(list(attrs = structure(list(id = 7018492265,
visible = NA_character_, timestamp = structure(list(sec = 42,
min = 7L, hour = 17L, mday = 5L, mon = 2L, year = 121L,
wday = 5L, yday = 63L, isdst = 0L, zone = "UTC",
gmtoff = NA_integer_), class = c("POSIXlt", "POSIXt"
)), version = 1, changeset = NA_real_, user = structure(NA_integer_, .Label = character(0), class = "factor"),
uid = structure(NA_integer_, .Label = character(0), class = "factor"),
lat = 48.1102703, lon = 11.8303853), row.names = 4281593L, class = "data.frame"),
tags = structure(list(id = numeric(0), k = structure(integer(0), .Label = character(0), class = "factor"),
v = structure(integer(0), .Label = character(0), class = "factor")), row.names = integer(0), class = "data.frame")), class = c("nodes",
"osmar_element", "list")), ways = structure(list(attrs = structure(list(
id = numeric(0), visible = character(0), timestamp = structure(list(
sec = numeric(0), min = integer(0), hour = integer(0),
mday = integer(0), mon = integer(0), year = integer(0),
wday = integer(0), yday = integer(0), isdst = integer(0),
zone = character(0), gmtoff = integer(0)), class = c("POSIXlt",
"POSIXt")), version = numeric(0), changeset = numeric(0),
user = structure(integer(0), .Label = character(0), class = "factor"),
uid = structure(integer(0), .Label = character(0), class = "factor")), row.names = integer(0), class = "data.frame"),
tags = structure(list(id = numeric(0), k = structure(integer(0), .Label = character(0), class = "factor"),
v = structure(integer(0), .Label = character(0), class = "factor")), row.names = integer(0), class = "data.frame"),
refs = structure(list(id = numeric(0), ref = numeric(0)), row.names = integer(0), class = "data.frame")), class = c("ways",
"osmar_element", "list")), relations = structure(list(attrs = structure(list(
id = numeric(0), visible = character(0), timestamp = structure(list(
sec = numeric(0), min = integer(0), hour = integer(0),
mday = integer(0), mon = integer(0), year = integer(0),
wday = integer(0), yday = integer(0), isdst = integer(0),
zone = character(0), gmtoff = integer(0)), class = c("POSIXlt",
"POSIXt")), version = numeric(0), changeset = numeric(0),
user = structure(integer(0), .Label = character(0), class = "factor"),
uid = structure(integer(0), .Label = character(0), class = "factor")), row.names = integer(0), class = "data.frame"),
tags = structure(list(id = numeric(0), k = structure(integer(0), .Label = character(0), class = "factor"),
v = structure(integer(0), .Label = character(0), class = "factor")), row.names = integer(0), class = "data.frame"),
refs = structure(list(id = numeric(0), type = structure(integer(0), .Label = character(0), class = "factor"),
ref = numeric(0), role = structure(integer(0), .Label = character(0), class = "factor")), row.names = integer(0), class = "data.frame")), class = c("relations",
"osmar_element", "list"))), class = c("osmar", "list")))
You can use Filter to keep only those objects that have more than 0 rows in attrs.
Filter(function(x) nrow(x$nodes$attrs) > 0, nodes_found)
#$`2`
#osmar object
#1 nodes, 0 ways, 0 relations
We can use keep from purrr
purrr::keep(nodes_found, ~ nrow(.x$nodes$attrs) > 0)

Unlist Function Is Turning Numeric Values Into Characters. R

The code is scraping a website for stock data and returns a 1x18 dataframe for each stock. I am trying to convert the dataframe into a vector without turning the numeric columns into factors which is what is happening. I have also attempted to try and turn the dataframe into a matrix, but the numeric columns are still being converted into factors. In conclusion, I would like to keep characters as characters and numeric as numeric all in a vector. Thank you.
#get.dates is a function I created to scrape
data = get.dates("AAPL")
class(data)
[1] "data.frame"
class(data$surprise)
[1] "numeric"
dput(data)
structure(list(date = "2019-05-07T00:00:00", company = "Apple",
ticker = "AAPL", periodEnding = "Mar 2019", eps = "2.37",
reportedEPS = NA_character_, lastEps = "2.73", consensus = 4L,
bpConsensus = 4L, ratingsAndPT = structure(list(priceTarget = 177.34,
numBuys = 17L, numHolds = 18L, numSells = 0L), class = "data.frame", row.names = c(NA,
-1L)), bpRatingsAndPT = structure(list(priceTarget = 176.88,
numBuys = 14L, numHolds = 14L, numSells = 0L), class = "data.frame", row.names = c(NA,
-1L)), marketCap = 827573630900, sector = 18731L, stockId = 7624L,
stockTypeId = 1L, surprise = NA_real_, timeOfDay = 4L, isConfirmed = FALSE), class = "data.frame", row.names = c(NA,
-1L))
data = unlist(data)
class(data)
[1] "character"
So the final output is to rbind each of the outputs into a single data.frame.
I think I have to convert each 1x18 dataframe into a vector to rbind because I am getting an error when trying to rbind the columns using the foreach package.
tickers = c("AAPL", "PEP", "KO")
system.time({
data = foreach(r = tickers, .packages = c("jsonlite", "dplyr"), .combine = rbind) %dopar% {get.dates(r)}
})
error calling combine function:
<simpleError in `.rowNamesDF<-`(x, value = value): duplicate 'row.names' are not allowed>
user system elapsed
0.02 0.00 0.56
Warning message:
non-unique value when setting 'row.names': ‘1’
print(data)
NULL
#I will do the same thing outside of the foreach loop to give some more context
data = lapply(tickers, get.dates)
do.call(rbind, data)
Error in `.rowNamesDF<-`(x, value = value) :
duplicate 'row.names' are not allowed
In addition: Warning message:
non-unique value when setting 'row.names': ‘1’
dput(data)
list(structure(list(date = "2019-05-07T00:00:00", company = "Apple",
ticker = "AAPL", periodEnding = "Mar 2019", eps = "2.37",
reportedEPS = NA_character_, lastEps = "2.73", consensus = 4L,
bpConsensus = 4L, ratingsAndPT = structure(list(priceTarget = 177.34,
numBuys = 17L, numHolds = 18L, numSells = 0L), class = "data.frame", row.names = c(NA,
-1L)), bpRatingsAndPT = structure(list(priceTarget = 176.88,
numBuys = 14L, numHolds = 14L, numSells = 0L), class = "data.frame", row.names = c(NA,
-1L)), marketCap = 827573630900, sector = 18731L, stockId = 7624L,
stockTypeId = 1L, surprise = NA_real_, timeOfDay = 4L, isConfirmed = FALSE), class = "data.frame", row.names = c(NA,
-1L)), structure(list(date = "2019-04-23T00:00:00", company = "Coca-Cola",
ticker = "KO", periodEnding = "Mar 2019", eps = "0.46", reportedEPS = NA_character_,
lastEps = "0.47", consensus = 4L, bpConsensus = 5L, ratingsAndPT = structure(list(
priceTarget = 50.89, numBuys = 4L, numHolds = 5L, numSells = 0L), class = "data.frame", row.names = c(NA,
-1L)), bpRatingsAndPT = structure(list(priceTarget = 51.25,
numBuys = 3L, numHolds = 1L, numSells = 0L), class = "data.frame", row.names = c(NA,
-1L)), marketCap = 193681840000, sector = 18731L, stockId = 8359L,
stockTypeId = 1L, surprise = NA_real_, timeOfDay = 4L, isConfirmed = FALSE), class = "data.frame", row.names = c(NA,
-1L)), structure(list(date = "2019-04-25T00:00:00", company = "PepsiCo",
ticker = "PEP", periodEnding = "Mar 2019", eps = "0.92",
reportedEPS = NA_character_, lastEps = "0.96", consensus = 4L,
bpConsensus = 4L, ratingsAndPT = structure(list(priceTarget = 123.67,
numBuys = 4L, numHolds = 3L, numSells = 0L), class = "data.frame", row.names = c(NA,
-1L)), bpRatingsAndPT = structure(list(priceTarget = 126,
numBuys = 1L, numHolds = 1L, numSells = 0L), class = "data.frame", row.names = c(NA,
-1L)), marketCap = 163697620000, sector = 18731L, stockId = 10962L,
stockTypeId = 1L, surprise = NA_real_, timeOfDay = 4L, isConfirmed = FALSE), class = "data.frame", row.names = c(NA,
-1L)))
Here is what I would like the output to look like. Thank you!!
You basically have to do your own list flattening here, which is not desirable. It's easier to do this when you get the json data originally. https://rdrr.io/cran/jsonlite/man/flatten.html
The solution below uses purrr, but you can do it with a for-loop or apply functions if you prefer. There are two main ideas here:
1. Bind together the dataframe-type columns with the part of the dataframe that doesn't have any nested columns. In your example, we bind together 3 separate pieces: 1 original dataframe with df_cols removed, and the other two dataframe columns. You can do this with bind_cols. It helps to prepend the original column names to avoid duplicates.
2. Collapse all the rows together with rbind or the like.
#' Expand the data-frame-typed columns of a data frame into ordinary columns.
#'
#' Every column of `df` that is itself a data frame is replaced by its own
#' columns, renamed to "<outer>.<inner>" so that columns coming from different
#' nested data frames cannot collide. The non-nested columns come first,
#' followed by the expanded nested ones.
#'
#' @param df A data frame that may contain data-frame columns.
#' @return `df` itself when it has no data-frame columns; otherwise the result
#'   of `bind_cols()` on the plain columns and the expanded nested columns.
flatten_df_cols <- function(df) {
  is_nested <- map_lgl(df, is.data.frame)

  # Nothing to expand: hand the input back and skip the bind entirely.
  if (!any(is_nested)) {
    return(df)
  }

  # Use list-style subsetting (`df[cols]`) rather than `df[, cols]`: the
  # matrix-style form drops to the bare column when exactly one column is
  # selected, which would make imap_dfc() walk the *inner* columns instead of
  # the nested data frames.
  expanded <- imap_dfc(
    df[is_nested],
    function(nested, outer_name) {
      setNames(nested, paste0(outer_name, ".", names(nested)))
    }
  )

  bind_cols(df[!is_nested], expanded)
}
map_dfr(data, flatten_df_cols)
Observations: 3
Variables: 24
$ date <chr> "2019-05-07T00:00:00", "2019-04...
$ company <chr> "Apple", "Coca-Cola", "PepsiCo"
$ ticker <chr> "AAPL", "KO", "PEP"
$ periodEnding <chr> "Mar 2019", "Mar 2019", "Mar 2019"
$ eps <chr> "2.37", "0.46", "0.92"
$ reportedEPS <chr> NA, NA, NA
$ lastEps <chr> "2.73", "0.47", "0.96"
$ consensus <int> 4, 4, 4
$ bpConsensus <int> 4, 5, 4
$ marketCap <dbl> 827573630900, 193681840000, 163...
$ sector <int> 18731, 18731, 18731
$ stockId <int> 7624, 8359, 10962
$ stockTypeId <int> 1, 1, 1
$ surprise <dbl> NA, NA, NA
$ timeOfDay <int> 4, 4, 4
$ isConfirmed <lgl> FALSE, FALSE, FALSE
$ ratingsAndPT.priceTarget <dbl> 177.34, 50.89, 123.67
$ ratingsAndPT.numBuys <int> 17, 4, 4
$ ratingsAndPT.numHolds <int> 18, 5, 3
$ ratingsAndPT.numSells <int> 0, 0, 0
$ bpRatingsAndPT.priceTarget <dbl> 176.88, 51.25, 126.00
$ bpRatingsAndPT.numBuys <int> 14, 3, 1
$ bpRatingsAndPT.numHolds <int> 14, 1, 1
$ bpRatingsAndPT.numSells <int> 0, 0, 0

Create logical list with strsplit on combined words to subset data frame

I am trying to subset my data frame according to a condition on a specific column. For this purpose I need to create a TRUE or FALSE value for each row of this column. But some rows of this column contain several combined words, and my code cannot detect them.
p <- sapply(strsplit(test$hashtags, split=","), function(x)any(x%in%"evet"))
When you check the sample data you can easily see that rows 5, 7, and 8 contain the specific word, but they are shown as FALSE.
I have tried adding the "unlist" command to my code, but it hasn't worked for me.
p <- sapply(unlist(strsplit(test$hashtags, split=",")), function(x)any(x%in%"evet"))
I need to create a single TRUE or FALSE value per row according to the specific word, even when a row combines more than one word.
Thanks in advance.
Sample Data:
test <- structure(list(created_at = structure(c(1489636860, 1489636860,
1489636860, 1489636860, 1489636860, 1489636860, 1489636860, 1489636860,
1489636860, 1489636860), class = c("POSIXct", "POSIXt"), tzone = "GMT"),
user.screen_name = c("bilge_bilir", "memetozturk93", "Byomeraslan",
"tmremolar", "orhanyilmaz_77", "tamdere", "EriVatan", "BaySancaktar",
"zeynepmekik", "EriVatan"), entities.hashtags = list(structure(list(
indices = list(c(84L, 90L)), text = "Hayır"), .Names = c("indices",
"text"), class = "data.frame", row.names = 1L), structure(list(
indices = list(c(65L, 70L)), text = "evet"), .Names = c("indices",
"text"), class = "data.frame", row.names = 1L), structure(list(
indices = list(c(98L, 103L)), text = "Evet"), .Names = c("indices",
"text"), class = "data.frame", row.names = 1L), structure(list(
indices = list(c(98L, 104L)), text = "Hayır"), .Names = c("indices",
"text"), class = "data.frame", row.names = 1L), structure(list(
indices = list(c(28L, 33L), c(45L, 50L), c(89L, 94L)),
text = c("EVET", "EVET", "EVET")), .Names = c("indices",
"text"), class = "data.frame", row.names = c(NA, 3L)), structure(list(
indices = list(c(38L, 43L)), text = "EVET"), .Names = c("indices",
"text"), class = "data.frame", row.names = 1L), structure(list(
indices = list(c(20L, 29L), c(36L, 46L), c(89L, 94L)),
text = c("Dirilişe", "Yükselişe", "Evet")), .Names = c("indices",
"text"), class = "data.frame", row.names = c(NA, 3L)), structure(list(
indices = list(c(10L, 15L), c(16L, 20L), c(21L, 26L),
c(27L, 31L)), text = c("Evet", "Eri", "Beli", "Yes"
)), .Names = c("indices", "text"), class = "data.frame", row.names = c(NA,
4L)), structure(list(indices = list(c(125L, 130L)), text = "Evet"), .Names = c("indices",
"text"), class = "data.frame", row.names = 1L), structure(list(
indices = list(c(102L, 107L)), text = "EVET"), .Names = c("indices",
"text"), class = "data.frame", row.names = 1L)), retweeted_status.created_at = c("Thu Mar 16 03:49:15 +0000 2017",
"Wed Mar 15 23:57:44 +0000 2017", "Wed Mar 15 21:07:54 +0000 2017",
"Wed Mar 15 20:54:43 +0000 2017", "Wed Mar 15 14:41:15 +0000 2017",
"Wed Mar 15 23:07:43 +0000 2017", "Wed Mar 15 15:41:06 +0000 2017",
NA, "Wed Mar 15 11:13:15 +0000 2017", "Wed Mar 15 16:37:13 +0000 2017"
), entities.user_mentions = list(structure(list(indices = list(
c(3L, 16L), c(18L, 30L), c(44L, 55L), c(56L, 71L), c(72L,
83L)), screen_name = c("seremgiz8289", "bilge_bilir",
"OduncuTimi", "yalcinvelioglu", "OPTlMlst_Z"), id = c(301944248,
2189106581, 2756465282, 2668851081, 2734161237), id_str = c("301944248",
"2189106581", "2756465282", "2668851081", "2734161237"),
name = c("ATA KIZI HAYIR DİYOR", "Bilge Eryuz", "OduncuTimi ®",
"Yalçın Velioğlu", "OPTlMlst_Z")), .Names = c("indices",
"screen_name", "id", "id_str", "name"), class = "data.frame", row.names = c(NA,
5L)), structure(list(indices = list(c(3L, 16L)), screen_name = "kendimce_ben",
id = 2322523731, id_str = "2322523731", name = "İzzet#EVET/\U0001f1f9\U0001f1f7"), .Names = c("indices",
"screen_name", "id", "id_str", "name"), class = "data.frame", row.names = 1L),
structure(list(indices = list(c(3L, 12L)), screen_name = "omrolcay",
id = 360420809L, id_str = "360420809", name = "Ömer Olcay"), .Names = c("indices",
"screen_name", "id", "id_str", "name"), class = "data.frame", row.names = 1L),
structure(list(indices = list(c(3L, 18L)), screen_name = "mehmet_asassoy",
id = 3151503430, id_str = "3151503430", name = "Mehmet Asassoy"), .Names = c("indices",
"screen_name", "id", "id_str", "name"), class = "data.frame", row.names = 1L),
structure(list(indices = list(c(3L, 17L), c(120L, 132L
)), screen_name = c("sevincbeykent", "yigitbulutt"),
id = c(538364458L, 256065299L), id_str = c("538364458",
"256065299"), name = c("Sevinç", "YİĞİT BULUT"
)), .Names = c("indices", "screen_name", "id", "id_str",
"name"), class = "data.frame", row.names = 1:2), structure(list(
indices = list(c(3L, 13L)), screen_name = "AKsamet54",
id = 313205928L, id_str = "313205928", name = "Samet ÇELİK"), .Names = c("indices",
"screen_name", "id", "id_str", "name"), class = "data.frame", row.names = 1L),
structure(list(indices = list(c(3L, 18L)), screen_name = "HayataTebessum",
id = 2911157237, id_str = "2911157237", name = "Meryem"), .Names = c("indices",
"screen_name", "id", "id_str", "name"), class = "data.frame", row.names = 1L),
structure(list(indices = list(c(0L, 9L)), screen_name = "4qet1dil",
id = 536676261L, id_str = "536676261", name = "KerenGo"), .Names = c("indices",
"screen_name", "id", "id_str", "name"), class = "data.frame", row.names = 1L),
structure(list(indices = list(c(3L, 18L)), screen_name = "akkadinantalya",
id = 1898504755L, id_str = "1898504755", name = "AK Kadın Antalya"), .Names = c("indices",
"screen_name", "id", "id_str", "name"), class = "data.frame", row.names = 1L),
structure(list(indices = list(c(3L, 15L)), screen_name = "menes__2010",
id = 186968367L, id_str = "186968367", name = "#EVET☪ ياسين ☝"), .Names = c("indices",
"screen_name", "id", "id_str", "name"), class = "data.frame", row.names = 1L)),
hashtags = c("hayir", "evet", "evet", "hayir", "c(\"evet\", \"evet\", \"evet\")",
"evet", "c(\"dirilise\", \"yukselise\", \"evet\")", "c(\"evet\", \"eri\", \"beli\", \"yes\")",
"evet", "evet"), mentions = list(c("seremgiz8289", "bilge_bilir",
"OduncuTimi", "yalcinvelioglu", "OPTlMlst_Z"), "kendimce_ben",
"omrolcay", "mehmet_asassoy", c("sevincbeykent", "yigitbulutt"
), "AKsamet54", "HayataTebessum", "4qet1dil", "akkadinantalya",
"menes__2010")), .Names = c("created_at", "user.screen_name",
"entities.hashtags", "retweeted_status.created_at", "entities.user_mentions",
"hashtags", "mentions"), row.names = c(NA, 10L), class = "data.frame")
That is mostly because of the way the hashtags column was generated. It was stored as a list of character vectors, and when coerced to character it gave this structure.
See for example,
list(c("A", "B", "C"))
#[[1]]
#[1] "A" "B" "C"
as.character(list(c("A", "B", "C")))
#[1] "c(\"A\", \"B\", \"C\")"
Checking an individual element on your dataframe gives the same structure.
test$hashtags[5]
#[1] "c(\"evet\", \"evet\", \"evet\")"
So if there is no way you could go back and change the way the hashtags column was generated, you can use grepl instead, which would save you from the strsplit and sapply calls as well.
grepl("evet", test$hashtags)
#[1] FALSE TRUE TRUE FALSE TRUE TRUE TRUE TRUE TRUE TRUE
I would use grepl here:
p <- sapply(strsplit(test$hashtags, split=","), function(x) {
grepl("evet", x)
})
If you really wanted to match the standalone word evet, then use word boundaries:
# Match "evet" only as a standalone word. The backslashes must be doubled:
# inside an R string literal "\b" is the backspace character, whereas "\\b"
# reaches the regex engine as the word-boundary anchor \b.
p <- sapply(strsplit(test$hashtags, split = ","), function(x) {
  grepl("\\bevet\\b", x)
})
We can create a logical index column with str_detect
library(tidyverse)
out <- test %>%
mutate(ind = str_detect(hashtags, pattern = "evet"))
out$ind
#[1] FALSE TRUE TRUE FALSE TRUE TRUE TRUE TRUE TRUE TRUE
If we need to get the logical index for each word
test %>%
mutate(ind = str_extract_all(hashtags, "\\w+") %>%
map(str_detect, pattern = "evet"))

Creating a nested list object

I have a dataframe as shown below ( 1st row column names, 2nd row data elements)
From
Col_Name Col_Child_1 Col_Grand_Child_1 Col_Great_Grand_Child_1 Col_Great_Grand_Child_Size1 Col_Great_Grand_Child_2 Col_Great_Grand_Child_Size2 Col_Great_Grand_Child_3 Col_Great_Grand_Child_Size3 Col_Great_Grand_Child_4 Col_Great_Grand_Child_Size4
Flare analytics cluster AgglomerativeCluster 3938 CommunityStructure 3812 HierarchicalCluster 6714 MergeEdge 743
I am trying to convert the data elements in the second row ( Flare, analytics....) into a nested list as shown below
> Flare
$name
[1] "flare"
$children
$children[[1]]
$children[[1]]$name
[1] "analytics"
$children[[1]]$children
$children[[1]]$children[[1]]
$children[[1]]$children[[1]]$name
[1] "cluster"
$children[[1]]$children[[1]]$children
$children[[1]]$children[[1]]$children[[1]]
$children[[1]]$children[[1]]$children[[1]]$name
[1] "AgglomerativeCluster"
$children[[1]]$children[[1]]$children[[1]]$size
[1] 3938
$children[[1]]$children[[1]]$children[[2]]
$children[[1]]$children[[1]]$children[[2]]$name
[1] "CommunityStructure"
$children[[1]]$children[[1]]$children[[2]]$size
[1] 3812
$children[[1]]$children[[1]]$children[[3]]
$children[[1]]$children[[1]]$children[[3]]$name
[1] "HierarchicalCluster"
$children[[1]]$children[[1]]$children[[3]]$size
[1] 6714
$children[[1]]$children[[1]]$children[[4]]
$children[[1]]$children[[1]]$children[[4]]$name
[1] "MergeEdge"
$children[[1]]$children[[1]]$children[[4]]$size
[1] 743
My attempts so far are very elementary and reflect my struggle :).
x = list(name= test1$Col_Name, children = c(test1$Col_Child_1)
Any help on solving this puzzle is much appreciated. Thanks in advance.
Below is the output from dput(test1)
test1 = structure(list(Col_Name = structure(2L, .Names = "row1", .Label = c("Col_Name",
"Flare"), class = "factor"), Col_Child_1 = structure(1L, .Names = "row1", .Label = c("analytics",
"Col_Child_1"), class = "factor"), Col_Grand_Child_1 = structure(1L, .Names = "row1", .Label = c("cluster",
"Col_Grand_Child_1"), class = "factor"), Col_Great_Grand_Child_1 = structure(1L, .Names = "row1", .Label = c("AgglomerativeCluster",
"Col_Great_Grand_Child_1"), class = "factor"), Col_Great_Grand_Child_Size1 = structure(1L, .Names = "row1", .Label = c("3938",
"Col_Great_Grand_Child_Size1"), class = "factor"), Col_Great_Grand_Child_2 = structure(2L, .Names = "row1", .Label = c("Col_Great_Grand_Child_2",
"CommunityStructure"), class = "factor"), Col_Great_Grand_Child_Size2 = structure(1L, .Names = "row1", .Label = c("3812",
"Col_Great_Grand_Child_Size2"), class = "factor"), Col_Great_Grand_Child_3 = structure(2L, .Names = "row1", .Label = c("Col_Great_Grand_Child_3",
"HierarchicalCluster"), class = "factor"), Col_Great_Grand_Child_Size3 = structure(1L, .Names = "row1", .Label = c("6714",
"Col_Great_Grand_Child_Size3"), class = "factor"), Col_Great_Grand_Child_4 = structure(2L, .Names = "row1", .Label = c("Col_Great_Grand_Child_4",
"MergeEdge"), class = "factor"), Col_Great_Grand_Child_Size4 = structure(1L, .Names = "row1", .Label = c("743",
"Col_Great_Grand_Child_Size4"), class = "factor")), .Names = c("Col_Name",
"Col_Child_1", "Col_Grand_Child_1", "Col_Great_Grand_Child_1",
"Col_Great_Grand_Child_Size1", "Col_Great_Grand_Child_2", "Col_Great_Grand_Child_Size2",
"Col_Great_Grand_Child_3", "Col_Great_Grand_Child_Size3", "Col_Great_Grand_Child_4",
"Col_Great_Grand_Child_Size4"), row.names = 2L, class = "data.frame")
the code below does not generalise well, so watch out and good luck with it :)
## get rid of factors -- start from `test1`, the data frame given by dput() in
## the question, and keep everything as plain character strings
dat <- data.frame(lapply(test1, as.character), stringsAsFactors = FALSE)

## identify descendants -- hoping that the great grand children do not reproduce further
## (each grep returns the column positions of one generation; the patterns are
## distinct enough that "Col_Great_Grand_Child_<n>" does not pick up the
## "..._Size<n>" columns)
Children.names <- grep("Col_Child_[[:digit:]]", colnames(dat))
Grand_Children.names <- grep("Col_Grand_Child_[[:digit:]]", colnames(dat))
Great_Grand_Children.names <- grep("Col_Great_Grand_Child_[[:digit:]]", colnames(dat))
Great_Grand_Children.sizes <- grep("Col_Great_Grand_Child_Size[[:digit:]]", colnames(dat))

## putting it together into a list of lists (of lists)
nggc <- length(Great_Grand_Children.sizes)
## seq_len() yields an empty sequence when there are no size columns,
## whereas 1:nggc would iterate over c(1, 0)
ggc <- lapply(seq_len(nggc), function(i) {
  list(
    name = dat[1, Great_Grand_Children.names[i]],
    ## index row 1 explicitly, mirroring the name lookup above
    size = as.numeric(dat[1, Great_Grand_Children.sizes[i]])
  )
})
gc <- list(name = dat[1, Grand_Children.names[1]], children = ggc)

## fingers crossed now...
ll <- list(name = dat$Col_Name)
## every `children` entry is a list of nodes, so the single grand child is
## wrapped in list() to give the requested $children[[1]]$children[[1]]$name shape
ll$children <- list(list(name = dat[1, Children.names[1]], children = list(gc)))

Resources