I have a matrix:
# Build a 2 x 4 matrix of random normal draws with named rows (R1, R2) and
# named columns (C1..C4).
mat1 <- matrix(
  rnorm(8),
  ncol = 4,
  dimnames = list(c("R1", "R2"), c("C1", "C2", "C3", "C4"))
)
> mat1
C1 C2 C3 C4
R1 1.226139 -1.0604743 -0.1803689 0.3852505
R2 -1.232622 -0.5567295 -0.4146919 0.2433812
and a covariate that matches the names of the matrix columns:
> covariate <- factor(c('A','A','B','B'))
> t(data.frame(covariate, colnames(mat1)))
[,1] [,2] [,3] [,4]
covariate "A" "A" "B" "B"
colnames.mat1. "C1" "C2" "C3" "C4"
I would like to melt it with respect to the covariate in order to have the following result:
Melting the data gives:
> melt( mat1 )
Var1 Var2 value
1 R1 C1 1.2261395
2 R2 C1 -1.2326215
3 R1 C2 -1.0604743
4 R2 C2 -0.5567295
5 R1 C3 -0.1803689
6 R2 C3 -0.4146919
7 R1 C4 0.3852505
8 R2 C4 0.2433812
However I would like to get the following result:
covariate_2 <- factor( c(rep('A',4) , rep('B',4) ))
> data.frame( covariate_2 , melted_data )
covariate_2 Var1 Var2 value
1 A R1 C1 1.2261395
2 A R2 C1 -1.2326215
3 A R1 C2 -1.0604743
4 A R2 C2 -0.5567295
5 B R1 C3 -0.1803689
6 B R2 C3 -0.4146919
7 B R1 C4 0.3852505
8 B R2 C4 0.2433812
I think there must be a way to get this result using the standard melt function. I would appreciate any help.
Perhaps it's easiest to just rename the columns of your matrix first, and then melt.
Here are a couple of examples, first using "data.table", and second using the "tidyverse":
library(data.table)
# Prefix every column name with its covariate label ("A_C1", ..., "B_C4"),
# melt the renamed matrix, then split the combined name held in Var2 into the
# new columns `cov` and `V1` and drop Var2. The trailing [] prints the result.
setDT(melt(`colnames<-`(mat1, paste(c('A','A','B','B'), colnames(mat1), sep = "_"))))[
, c("cov", "V1") := tstrsplit(Var2, "_")][, Var2 := NULL][]
# Var1 value cov V1
# 1: R1 1.2261390 A C1
# 2: R2 -1.2326220 A C1
# 3: R1 -1.0604743 A C2
# 4: R2 -0.5567295 A C2
# 5: R1 -0.1803689 B C3
# 6: R2 -0.4146919 B C3
# 7: R1 0.3852505 B C4
# 8: R2 0.2433812 B C4
library(tidyverse)
# Same idea with tidyverse verbs: tag each column name with its covariate
# label, reshape to long form, then split the tag back out.
`colnames<-`(mat1, paste(c('A','A','B','B'), colnames(mat1), sep = "_")) %>%
as.data.frame() %>%
# keep the row names (R1, R2) as a regular `rowname` column
rownames_to_column() %>%
# one row per (rowname, column) pair: column name in `var`, value in `val`
gather(var, val, -rowname) %>%
# split e.g. "A_C1" into cov = "A" and var1 = "C1"
separate(var, into = c("cov", "var1"))
# rowname cov var1 val
# 1 R1 A C1 1.2261390
# 2 R2 A C1 -1.2326220
# 3 R1 A C2 -1.0604743
# 4 R2 A C2 -0.5567295
# 5 R1 B C3 -0.1803689
# 6 R2 B C3 -0.4146919
# 7 R1 B C4 0.3852505
# 8 R2 B C4 0.2433812
Sample data:
mat1 <- structure(c(1.226139, -1.232622, -1.0604743, -0.5567295, -0.1803689,
-0.4146919, 0.3852505, 0.2433812), .Dim = c(2L, 4L), .Dimnames = list(
c("R1", "R2"), c("C1", "C2", "C3", "C4")))
Related
I have example data as follows:
library(data.table)
dat <- fread("Survey Variable_codes_2022
D D1
A A1
B B1
B B3
B B2
E E1
B NA
E NA")
For the two rows that have Variable_codes_2022==NA, I would like to increment the variable code so that it becomes:
# Desired result: each missing code is replaced by the next unused code for
# its Survey (B -> B4, E -> E2).
dat <- fread("Survey Variable_codes_2022
D D1
A A1
B B1
B B3
B B2
E E1
B B4
E E2")
Because the column Variable_codes_2022 is a string variable, the numbers are not in numerical order.
I have no idea where to start and I was wondering if someone could help me on the right track.
We could do it this way:
grouping
arranging and
mutate.
To keep the original order we could first create an id and then rearrange:
library(dplyr)
# Renumber the codes within each Survey.
dat %>%
group_by(Survey) %>%
# no sort columns are given, so .by_group = TRUE orders rows by Survey only
arrange(.by_group = TRUE) %>%
# every code (existing or missing) is rebuilt as Survey + position in group
mutate(Variable_codes_2022 = paste0(Survey, row_number()))
Survey Variable_codes_2022
<chr> <chr>
1 A A1
2 B B1
3 B B2
4 B B3
5 B B4
6 D D1
7 E E1
8 E E2
data.table option using rleid like this:
library(data.table)
# Within each Survey, number the runs of Variable_codes_2022 and rebuild every
# code as Survey + run number.
# NOTE(review): rleid() gives consecutive equal values the same id, so this
# presumably needs adjacent codes within a Survey to differ (e.g. no two NAs
# in a row) - confirm before using on data with several missing codes.
dat[, Variable_codes_2022 := paste0(Survey, rleid(Variable_codes_2022)), by = Survey]
dat
#> Survey Variable_codes_2022
#> 1: D D1
#> 2: A A1
#> 3: B B1
#> 4: B B2
#> 5: B B3
#> 6: E E1
#> 7: B B4
#> 8: E E2
Created on 2022-12-01 with reprex v2.0.2
# Sample data with three missing codes (two for survey B, one for E).
# The `.internal.selfref = <pointer: ...>` attribute printed by dput() is not
# valid R syntax and cannot be recreated from text, so it is left out.
dat <-
  structure(
    list(
      survey = c("D", "A", "B", "B", "B", "E", "B", "E", "B"),
      var_code = c("D1", "A1", "B1", "B3", "B2", "E1", NA, NA, NA)
    ),
    row.names = c(NA, -9L),
    class = c("data.table", "data.frame")
  )
library(dplyr)
library(stringr)
# Fill in only the missing codes, leaving every existing code untouched.
dat %>%
  group_by(survey) %>%
  mutate(
    # numeric suffix of each existing code (NA where the code is missing)
    aux1 = as.numeric(stringr::str_remove(var_code, survey)),
    # running count of missing codes within the survey
    aux2 = cumsum(is.na(var_code)),
    # a missing code becomes <survey><largest existing suffix + running count>;
    # if_else() keeps each non-missing code exactly as it was
    var_code = if_else(
      is.na(var_code),
      paste0(survey, max(aux1, na.rm = TRUE) + aux2),
      var_code
    )
  ) %>%
  ungroup() %>%
  select(-aux1, -aux2)
# A tibble: 9 x 2
survey var_code
<chr> <chr>
1 D D1
2 A A1
3 B B1
4 B B3
5 B B2
6 E E1
7 B B4
8 E E2
9 B B5
Here is a solution using rowid.
Added an extra element to the sample so it can be tested against multiple missings
library(data.table)
#> Warning: package 'data.table' was built under R version 4.2.2
dat <- fread("Survey Variable_codes_2022
D D1
A A1
B B1
B B3
B B2
E E1
B NA
E NA
E NA")
# Numeric suffix of each existing code: the part of Variable_codes_2022 that
# follows the Survey prefix (NA where the code itself is missing).
dat[, n := as.numeric(substr(
  Variable_codes_2022, nchar(Survey) + 1, nchar(Variable_codes_2022)))]
# For the rows without a code: running index per Survey among those rows
# (rowid) plus the largest suffix already present for that Survey, looked up
# by joining back to the full table on Survey.
dat[is.na(n),
    Variable_codes_2022 := paste0(Survey, rowid(Survey) +
      dat[.SD[, .(Survey)], .(m = max(n, na.rm = TRUE)), on = "Survey", by = .EACHI][, m])]
dat
#> Survey Variable_codes_2022 n
#> 1: D D1 1
#> 2: A A1 1
#> 3: B B1 1
#> 4: B B3 3
#> 5: B B2 2
#> 6: E E1 1
#> 7: B B4 NA
#> 8: E E2 NA
#> 9: E E3 NA
I have a nested list l in which each item is itself a two-level list. For example:
l1 = list("a", list("a1"= "a1v"))
l2 = list("b", list("b1" = "b1v", b2 = "b2v"))
l3 = list("c", list("c1" = c("c1v1", "c1v2", "c1v3")))
l = list(l1, l2, l3)
How do I transform it into a data.frame like this:
df = data.frame(A = c("a", "b", "b", "c", "c", "c"), B= c("a1", "b1", "b2", "c1", "c1", "c1"), C=c("a1v", "b1v", "b2v", "c1v1", "c1v2", "c1v3"))
> df
A B C
1 a a1 a1v
2 b b1 b1v
3 b b2 b2v
4 c c1 c1v1
5 c c1 c1v2
6 c c1 c1v3
I tried separate_rows and map_df, but both failed to deal with the inconsistent dimensions of the .x[[2]] items.
Update 1:
@akrun's solution is not running for me:
We could use bind_rows with map
library(purrr)
library(dplyr)
library(tidyr)
# For each item of l: bind its label and inner list into one wide table,
# pivot every column except the first into B (name) / C (value) pairs, and
# rename the first column to A. map_dfr() row-binds the per-item results.
map_dfr(l, ~bind_cols(.x) %>%
pivot_longer(cols = -1, names_to = 'B', values_to = 'C') %>%
rename_at(1, ~'A'))
# A tibble: 6 x 3
# A B C
#* <chr> <chr> <chr>
#1 a a1 a1v
#2 b b1 b1v
#3 b b2 b2v
#4 c c1 c1v1
#5 c c1 c1v2
#6 c c1 c1v3
If the sample data in your question accurately reflects your actual data, you can try one of the following:
library(data.table)
# Flatten l into two columns: V1 = element names, V2 = values. The top-level
# labels ("a", "b", "c") are unnamed, so their V1 is "".
# cumsum(V1 == "") starts a new group at every label row; V3 := V2[1] copies
# that label onto all rows of its group, and the label rows are then dropped.
data.table(l)[, list(names(unlist(l)),
unlist(l, use.names = FALSE))][
, V3 := V2[1], cumsum(V1 == "")][V1 != ""]
## V1 V2 V3
## 1: a1 a1v a
## 2: b1 b1v b
## 3: b2 b2v b
## 4: c11 c1v1 c
## 5: c12 c1v2 c
## 6: c13 c1v3 c
# Name each item's inner list (its second element, picked with [[-1]]) after
# the item's first element, then melt the resulting named nested list.
reshape2::melt(setNames(lapply(l, "[[", -1), lapply(l, "[[", 1)))
## value L2 L1
## 1 a1v a1 a
## 2 b1v b1 b
## 3 b2v b2 b
## 4 c1v1 c1 c
## 5 c1v2 c1 c
## 6 c1v3 c1 c
Base R option :
# Build one data frame per list item and stack them with rbind:
#   A = the item's label,
#   B = the name of the inner element each value came from,
#   C = the flattened values.
# rep(..., lengths(...)) repeats each inner name once per value, so B and C
# always line up even when an inner element holds several values.
do.call(rbind, lapply(l, function(x) {
  data.frame(
    A = x[[1]],
    B = rep(names(x[[2]]), lengths(x[[2]])),
    C = unlist(x[[2]])
  )
}))
# A B C
#a1 a a1 a1v
#b1 b b1 b1v
#b2 b b2 b2v
#c11 c c1 c1v1
#c12 c c1 c1v2
#c13 c c1 c1v3
Since this is also one of the solutions, I will post it here as well. This is the one I can relate to.
# For each item of l build a tibble with A = the item's label, B = the names
# of its inner list and C = the flattened values; map_df() row-binds them.
map_df(l, ~ tibble(A=.x[[1]], B=names(.x[[2]]), C= unlist(.x[[2]])))
Read:
Run through all elements of l and make a data.frame (map_df with ~ inside) from a sub-data.frame created by tibble, where column A = ..., B = ..., and so on.
Thanks go to:
@akrun for the prompt answer; I could have used that solution, but was
too busy to figure it out.
@A5C1D2H2I1M1N2O1R2T1 also provided a
performant answer.
@Ronak Shah provided a plain base R
solution that I could translate to this.
I am still relatively new to working in R and I am not sure how to approach this problem. Any help or advice is greatly appreciated!!!
The problem I have is that I am working with two data frames and I need to recode the first data frame with values from the second. The first data frame (df1) contains the data from the respondents to a survey and the other data frame(df2) is the data dictionary for df1.
The data looks like this:
df1 <- data.frame(a = c(1,2,3),
b = c(4,5,6),
c = c(7,8,9))
df2 <- data.frame(columnIndicator = c("a","a","a","b","b","b","c","c","c" ),
df1_value = c(1,2,3,4,5,6,7,8,9),
new_value = c("a1","a2","a3","b1","b2","b3","c1","c2","c3"))
So far I can manually recode df1 to get the expected output by doing this:
df1 <- within(df1,{
a[a==1] <- "a1"
a[a==2] <- "a2"
a[a==3] <- "a3"
b[b==4] <- "b4"
b[b==5] <- "b5"
b[b==6] <- "b6"
c[c==7] <- "c7"
c[c==8] <- "c8"
c[c==9] <- "c9"
})
However my real dataset has about 42 columns that need to be recoded and that method is a little time intensive. Is there another way in R for me to recode the values in df1 with the values in df2?
Thanks!
Just need to transform the shape a bit.
library(data.table)
df1 <- data.frame(a = c(1,2,3),
b = c(4,5,6),
c = c(7,8,9))
df2 <- data.frame(columnIndicator = c("a","a","a","b","b","b","c","c","c" ),
df1_value = c(1,2,3,4,5,6,7,8,9),
new_value = c("a1","a2","a3","b4","b5","b6","c7","c8","c9"),stringsAsFactors = FALSE)
# Convert both data frames to data.tables by reference.
setDT(df1)
setDT(df2)
# Row id, used later to rebuild the wide layout after recoding.
df1[,ID:=.I]
# Long form: one row per (ID, column, value).
ldf1 <- melt(df1,measure.vars = c("a","b","c"),variable.name = "columnIndicator",value.name = "df1_value")
# Update join: look up each (column, value) pair in the dictionary df2 and add
# its new_value to ldf1.
ldf1[df2,"new_value":=i.new_value,on=.(columnIndicator,df1_value)]
ldf1
#> ID columnIndicator df1_value new_value
#> 1: 1 a 1 a1
#> 2: 2 a 2 a2
#> 3: 3 a 3 a3
#> 4: 1 b 4 b4
#> 5: 2 b 5 b5
#> 6: 3 b 6 b6
#> 7: 1 c 7 c7
#> 8: 2 c 8 c8
#> 9: 3 c 9 c9
# Back to wide form: one row per ID, one column per original variable, with
# the recoded values as cell contents.
dcast(ldf1,ID~columnIndicator,value.var = "new_value")
#> ID a b c
#> 1: 1 a1 b4 c7
#> 2: 2 a2 b5 c8
#> 3: 3 a3 b6 c9
Created on 2020-04-18 by the reprex package (v0.3.0)
In base R, we can unlist df1, match it against df1_value, and pick the corresponding new_value.
# Flatten df1, find each value's position in df2$df1_value and take the
# new_value at that position; assigning into df1[] keeps df1's shape and names.
# NOTE(review): the lookup ignores columnIndicator, so it presumably relies on
# every value occurring only once in df2$df1_value - confirm for real data.
df1[] <- df2$new_value[match(unlist(df1), df2$df1_value)]
df1
# a b c
#1 a1 b1 c1
#2 a2 b2 c2
#3 a3 b3 c3
Is this what you are looking for???
library(dplyr)
library(tidyr) # gather() is exported by tidyr, not dplyr
# Long form of df1: the column name goes to `key`, the cell value to `value`.
df3 <- df1 %>% gather(key = "key", value = "value")
# Attach the dictionary entry for each (column, value) pair; pairs that are
# not in df2 are dropped by the inner join.
df3 %>% inner_join(df2, by = c("key" = "columnIndicator", "value" = "df1_value"))
Output
key value new_value
1 a 1 a1
2 a 2 a2
3 a 3 a3
4 b 4 b1
5 b 5 b2
6 b 6 b3
7 c 7 c1
8 c 8 c2
9 c 9 c3
We have a rating matrix:
df <- data.frame(Customer.ID=c("c1",'c1','c1','c2','c2','c3'),
Movie.ID=c("m1", "m3", "m5", "m1", "m5", "m7"),
Rating=c(1,2,1,3,3,1))
df
Customer.ID Movie.ID Rating
1 c1 m1 1
2 c1 m3 2
3 c1 m5 1
4 c2 m1 3
5 c2 m5 3
6 c3 m7 1
When I spread and change row names like this:
df1 <- df %>% spread(key = 'Movie.ID', value = 'Rating')
df1 <- data.frame(df1, row.names = 'Customer.ID')
I get:
> df1
m1 m3 m5 m7
c1 1 2 1 NA
c2 3 NA 3 NA
c3 NA NA NA 1
I want to make df1 look like df again.
I have tried:
df2 <-setDT(df1, keep.rownames = TRUE)[]
df2 <- gather(df2, Video.ID, Rating, 2:4)
But it returns me:
> df2
rn m7 Video.ID Rating
1 c1 NA m1 1
2 c2 NA m1 3
3 c3 1 m1 NA
4 c1 NA m3 2
5 c2 NA m3 NA
6 c3 1 m3 NA
7 c1 NA m5 1
8 c2 NA m5 3
9 c3 1 m5 NA
While I am not certain why you are doing this (see @Jack Brookes's comment), you can do this pretty readily with dplyr and tidyr functions:
# Undo the spread: move the row names back into Customer.ID, stack the movie
# columns m1..m7 into Movie.ID / Rating pairs, and drop the NA cells that
# stand for customer/movie pairs without a rating.
df1 %>%
rownames_to_column('Customer.ID') %>%
gather(m1:m7, key = 'Movie.ID', value = 'Rating') %>%
filter(!is.na(Rating))
Which gives us:
Customer.ID Movie.ID Rating
1 c1 m1 1
2 c2 m1 3
3 c1 m3 2
4 c1 m5 1
5 c2 m5 3
6 c3 m7 1
This is my First dataframe,
# Seed the RNG before drawing so the random 0/1 data frame is reproducible.
set.seed(11)
# 9 x 9 data frame of Bernoulli(0.5) draws.
df1 <- as.data.frame(matrix(rbinom(9 * 9, 1, 0.5), ncol = 9, nrow = 9))
# Column names a1..a3, b1..b3, c1..c3.
colnames(df1) <- paste0(rep(c("a", "b", "c"), each = 3), rep(c(1, 2, 3), 3))
This is my Second dataframe,
factor.1 <- paste(rep(c("a","b"), each=3), rep(c(1,2,3), 2), sep = "")
factor.2 <- rep(paste(rep("c", 3), c(1,2,3), sep = ""), 2)
df2 <- as.data.frame(cbind(factor.1,factor.2))
I want to calculate the result in each column and put it inside the second dataframe. I use dplyr
# Sum the entries of the column(s) `x` of the global data frame `df1`.
#
# x: one or more column names (or indices) of df1.
# Returns a single number: the total over all selected cells. Passing several
# columns at once therefore yields one grand total, not one sum per column.
fun1 <- function(x) {
  sum(df1[, x])
}
df2%>% mutate(value = fun1(factor.1))
But what I get is this,
factor.1 factor.2 value
1 a1 c1 22
2 a2 c2 22
3 a3 c3 22
4 b1 c1 22
5 b2 c2 22
6 b3 c3 22
But What I want is this,
factor.1 factor.2 value
1 a1 c1 4
2 a2 c2 4
3 a3 c3 4
4 b1 c1 1
5 b2 c2 4
6 b3 c3 5
Is this what you are looking for ?
# Call fun1 once per entry of factor.1 so every row gets its own column sum.
# as.character() makes sure columns are looked up by name even when factor.1
# is stored as a factor (indexing with a factor uses its integer codes);
# vapply() guarantees one numeric value per row.
df2 %>%
  mutate(
    value = vapply(as.character(factor.1), fun1, numeric(1), USE.NAMES = FALSE)
  )