Consider this simple example
> weird_df <- data_frame(col1 =c('hello', 'world', 'again'),
+ col_weird = list(list(12,23), list(23,24), NA))
>
> weird_df
# A tibble: 3 x 2
col1 col_weird
<chr> <list>
1 hello <list [2]>
2 world <list [2]>
3 again <lgl [1]>
I need to extract the values in the col_weird column. How can I do that? I see how to do that in Python but not in R. The expected output is:
> good_df
# A tibble: 3 x 3
col1 tic toc
<chr> <dbl> <dbl>
1 hello 12 23
2 world 23 24
3 again NA NA
If you collapse the list column into a string you can use separate from tidyr. I used map from purrr to loop through the list column and create a string with toString.
library(tidyr)
library(purrr)
weird_df %>%
mutate(col_weird = map(col_weird, toString ) ) %>%
separate(col_weird, into = c("tic", "toc"), convert = TRUE)
# A tibble: 3 x 3
col1 tic toc
* <chr> <int> <int>
1 hello 12 23
2 world 23 24
3 again NA NA
You can actually use separate directly without the toString part but you end up with "list" as one of the values.
weird_df %>%
separate(col_weird, into = c("list", "tic", "toc"), convert = TRUE) %>%
select(-list)
This led me to tidyr::extract, which works fine with the right regular expression. If your list column was more complicated, though, writing out the regular expression might be a pain.
weird_df %>%
extract(col_weird, into = c("tic", "toc"), regex = "([[:digit:]]+), ([[:digit:]]+)", convert = TRUE)
You can do this with base R, thanks to I():
weird_df <- data.frame(col1 =c('hello', 'world'),
col_weird = I(list(list(12,23),list(23,24))))
weird_df
> col1 col_weird
1 hello 12, 23
2 world 23, 24
weird_df <- data_frame(col1 = c('hello', 'world'),
col_weird = list(list(12,23), list(23,24)))
library(dplyr)
weird_df %>%
dplyr::mutate(tic = unlist(magrittr::extract2(col_weird, 1)),
toc = unlist(magrittr::extract2(col_weird, 2)),
col_weird = NULL)
With the last changes: Note that now col_weird contains list(NA, NA)
weird_df <- data_frame(col1 = c('hello', 'world', 'again'),
col_weird = list(list(12,23), list(23,24), list(NA, NA)))
library(dplyr)
weird_df %>%
dplyr::mutate(col_weird = matrix(col_weird),
tic = sapply(col_weird, function(x) magrittr::extract2(x, 1)),
toc = sapply(col_weird, function(x) magrittr::extract2(x, 2)),
col_weird = NULL)
Here is one option using purrr/tidyverse/reshape2. We unlist each element of 'col_weird' within map so that the output is a list, set the names of that list from 'col1', melt it to 'long' format, group by 'L1', create an 'rn' column, spread it back to 'wide' format, and finally join the result back to the 'col1' column of the original data.
library(tidyverse)
library(reshape2)
weird_df$col_weird %>%
map(unlist) %>%
setNames(., weird_df$col1) %>%
melt %>%
group_by(L1) %>%
mutate(rn = c('tic', 'toc')[row_number()]) %>%
spread(rn, value) %>%
left_join(weird_df[-2], ., by = c(col1 = "L1"))
well, I came up with a simple one
> weird_df %>%
+ rowwise() %>%
+ mutate(tic = col_weird[[1]],
+ tac = ifelse(length(col_weird) == 2, col_weird[[2]], NA)) %>%
+ select(-col_weird) %>% ungroup()
# A tibble: 3 x 3
col1 tic tac
<chr> <dbl> <dbl>
1 hello 12 23
2 world 23 24
3 again NA NA
Related
I have a large data frame. I'm trying to remove the character v from the variable names of the data frame.
df <- tibble(q_ve5 = 1:2,
q_f_1v = 3:4,
q_vf_2 = 3:4,
q_e6 = 5:6,
q_ev8 = 5:6)
I tried this. It seems my regular expression pattern is not correct
df %>%
rename_all(~ str_remove(., "\\v\\d+$"))
My desired col names:
q_e5 q_f_1 q_f_2 q_e6 q_e8
If we only need to remove 'v', the one-or-more-digits part (\\d+) anchored at the end ($) is not needed, because the expected output also removes the 'v' from the first column, 'q_ve5', where it is not followed only by digits.
library(dplyr)
library(stringr)
df %>%
rename_with(~ str_remove(., "v"), everything())
-output
# A tibble: 2 × 5
q_e5 q_f_1 q_f_2 q_e6 q_e8
<int> <int> <int> <int> <int>
1 1 3 3 5 5
2 2 4 4 6 6
Or without any packages
names(df) <- sub("v", "", names(df))
I would like to combine two variables that have only one answer each into a single variable that has both answers.
Example
IPV_YES only has answers that are 1
IPV_NO only has answers that are 2
I would like to combine them into a single variable named IPV that would have the 1 and 2 results from both individual categories.
I have tried using the ifelse command, but it only shows me the value of IPV_YES.
Dataset I have
My desired outcome
my answer
df %>% mutate(across(everything(), ~ifelse(. == "", NA, as.numeric(.)))) %>%
group_by(ID) %>%
rowwise() %>%
transmute(IPV = sum(c_across(everything()), na.rm = T))
# A tibble: 4 x 2
# Rowwise: ID
ID IPV
<dbl> <dbl>
1 1 1
2 2 2
3 3 1
4 4 2
data
df <- data.frame(ID = 1:4, IPV_YES = c(1,"",1,""), IPV_NO = c("",2,"",2))
We can use coalesce after converting the '' to NA
library(dplyr)
df <- df %>%
transmute(ID, IPV = coalesce(na_if(IPV_YES, ""), na_if(IPV_NO, ""))) %>%
type.convert(as.is = TRUE)
data
df <- data.frame(ID = 1:4, IPV_YES = c(1,"",1,""), IPV_NO = c("",2,"",2))
df$IPV <- ifelse(df$IPV_YES != "", df$IPV_YES, df$IPV_NO[!df$IPV_NO==""])
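Here, we specify an ifelse statement; it can be glossed thus: if the value in df$IPV_YES is not blank, then give the value in df$IPV_YES, else give those values from df$IPV_NO that are not blank. Note that the subsetted vector df$IPV_NO[!df$IPV_NO==""] is shorter than the data frame and is recycled to full length rather than aligned by row; it gives the right answer here only because every non-blank IPV_NO value is the same. Passing df$IPV_NO itself as the third argument is the safer general form.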
If you want to remove the IPV_* columns:
df[,2:3] <- NULL
Result:
df
ID IPV
1 1 1
2 2 2
3 3 1
4 4 2
Data:
df <- data.frame(ID = 1:4, IPV_YES = c(1,"",1,""), IPV_NO = c("",2,"",2))
Maybe you can try the code below
replace(df, df == "", NA) %>%
mutate(IPV = coalesce(IPV_YES, IPV_NO)) %>%
select(ID, IPV) %>%
type.convert(as.is = TRUE)
which gives
ID IPV
1 1 1
2 2 2
3 3 1
4 4 2
data=data.frame("StudentID"=c(1,2,3,4,5),
"Class"=c(1,2,2,3,3),
"Type"=c('A','A','B','B','B'))
Say you have data as shown above and you wish for summaries like this,
What is the effective solution to do this and output to a csv in organized way such as shown above?
Example data if there are weights involved and you want weighted counts and proportions.
data1=data.frame("StudentID"=c(1,2,3,4,5),
"Class"=c(1,2,2,3,3),
"Type"=c('A','A','B','B','B'),
"Weighting"=c(10,6,13,12,2))
One option is map
library(dplyr)
library(purrr)
map_dfr(names(data)[2:3], ~
data %>%
select(.x) %>%
group_by_at(.x) %>%
summarise(COUNT = n()) %>%
mutate(PROP = COUNT/sum(COUNT)))
# A tibble: 5 x 4
# Class COUNT PROP Type
#* <dbl> <int> <dbl> <fct>
#1 1 1 0.2 <NA>
#2 2 2 0.4 <NA>
#3 3 2 0.4 <NA>
#4 NA 2 0.4 A
#5 NA 3 0.6 B
Or with data.table by melting into 'long' format
library(data.table)
melt(setDT(data), id.var = 'StudentID')[, .(COUNT = .N),
.(variable, value)][, PROP := COUNT/sum(COUNT),.(variable)][]
Or with base R using table and prop.table
lapply(data[-1], function(x) {x1 <- table(x); x2 <- prop.table(x1); cbind(COUNT = x1, PROP = x2)})
Both summaries are simple, here I use dplyr. To combine them in the way you want, it's going to need to be slapped together in a somewhat inelegant way. You can remove the name col1 if you want
library(dplyr)
df1 <- data %>% group_by(Class) %>%
summarise(Count = n(), Prop = n() / nrow(data))
df2 <- data %>% group_by(Type) %>%
summarise(Count = n(), Prop = n() / nrow(data))
names(df1)[1] <- 'col1'
names(df2)[1] <- 'col1'
rbind(
c('Class', '', ''),
df1,
c('Type', '', ''),
df2
)
# A tibble: 7 x 3
col1 Count Prop
<chr> <chr> <chr>
1 Class "" ""
2 1 1 0.2
3 2 2 0.4
4 3 2 0.4
5 Type "" ""
6 A 2 0.4
7 B 3 0.6
I'm working with json data which I've converted into a tibble with some list columns. I'm trying to extract the useful information from the list columns but am facing issues. If given the following dataset-
mydf <-tibble(
x = c(1, 2, 3),
y = list(list(list(id="id1", title="title1"), list(id="id11", title="title11")),
list(id="id2",title="title2"),
NULL)
)
How can I convert it into the following-
data.frame(x=c(1:3), id = c("id1;id11", "id2", ""), title = c("title1;title11", "title2", ""))
# x id title
#1 1 id1;id11 title1;title11
#2 2 id2 title2
#3 3
Any help is appreciated. Thanks!
I think there are better ways, but this is what I can do for now. For each row, I extracted strings and concatenated them with toString(). Since unnest() creates multiple rows for each row (i.e., 1, 2, and 3 in x), I used summarize() to temporarily combine strings. Then, I separate them using separate().
mydf %>%
unnest(y, keep_empty = TRUE) %>%
rowwise %>%
mutate(y = toString(unlist(y))) %>%
group_by(x) %>%
summarize(string = paste(y, collapse = "_")) %>%
separate(col = string, into = c("id", "title"), sep = "_")
# x id title
# <dbl> <chr> <chr>
#1 1 id1, title1 id11, title11
#2 2 id2 title2
#3 3 "" NA
If the names are consistent as in the example, you can do:
mydf2 <- unlist(mydf)
x <- mydf2[grepl("x", names(mydf2))]
id <- mydf2[grepl("id", names(mydf2))]
title <- mydf2[grepl("title", names(mydf2))]
tibble(x, id, title)
# A tibble: 3 x 3
x id title
<chr> <chr> <chr>
1 1 id1 title1
2 2 id11 title11
3 3 id2 title2
I am trying to convert the following list to a dataframe.
I have tried melt/cast, ldply, unlist etc but can't seem to get the expected output.
Many thanks in advance!
df <- list(
name=rep(c(11,12), each=1),
value=rnorm(2),
name=rep(c(13,14), each=1),
value=rnorm(2)
)
df
I want the following output in a dataframe:
name value
11 1.187
12 0.691
13 0.452
14 0.898
An option is to stack into a two column data.frame, and spread it back to 'wide' format
library(tidyverse)
enframe(df) %>%
unnest(value) %>%
group_by(name) %>%
mutate(rn = row_number()) %>%
spread(name, value) %>%
select(-rn)
# A tibble: 4 x 2
# name value
# <dbl> <dbl>
#1 11 -0.484
#2 12 -0.110
#3 13 -0.328
#4 14 0.0737
Or another option is to make use of pivot_longer from the devel version of tidyr
df %>%
set_names(str_c(names(.), "_", cumsum(names(.) == "name"))) %>%
as_tibble %>%
mutate(rn = row_number()) %>%
pivot_longer(-rn, names_to =c(".value", "group"), names_sep = '_') %>%
select(name, value)
Or using base R
reshape(transform(stack(df), rn = ave(seq_along(ind), ind,
FUN = seq_along)), idvar = 'rn', direction = 'wide', timevar = 'ind')
Here's a way in base R using split -
data.frame(
split(v <- unlist(df), sub("[0-9]+", "", names(v)))
)
name value
1 11 -0.2282623
2 12 -0.8101849
3 13 -0.9311898
4 14 0.3638835
Data -
df <- structure(list(name = c(11, 12), value = c(-0.22826229127103,
-0.810184913338659), name = c(13, 14), value = c(-0.931189778412408,
0.363883463286674)), .Names = c("name", "value", "name", "value"
))
d <- data.frame(
name = unlist(df[names(df) == "name"]),
value = unlist(df[names(df) == "value"])
)
the_list <- list(
name=rep(c(11,12), each=1),
value=rnorm(2),
name=rep(c(13,14), each=1),
value=rnorm(2)
)
df <- data.frame(name = unlist(the_list[which(names(the_list) == "name")]),
value = unlist(the_list[which(names(the_list) == "value")]))
df
# name value
# 1 11 -0.83130395
# 2 12 -0.12782566
# 3 13 2.59769395
# 4 14 -0.06967617