Reverse spread using gather - r

We have a rating matrix:
df <- data.frame(Customer.ID=c("c1",'c1','c1','c2','c2','c3'),
Movie.ID=c("m1", "m3", "m5", "m1", "m5", "m7"),
Rating=c(1,2,1,3,3,1))
df
Customer.ID Movie.ID Rating
1 c1 m1 1
2 c1 m3 2
3 c1 m5 1
4 c2 m1 3
5 c2 m5 3
6 c3 m7 1
When I spread and change row names like this:
df1 <- df %>% spread(key = 'Movie.ID', value = 'Rating')
df1 <- data.frame(df1, row.names = 'Customer.ID')
I get:
> df1
m1 m3 m5 m7
c1 1 2 1 NA
c2 3 NA 3 NA
c3 NA NA NA 1
I want to make df1 look like df again.
I have tried:
df2 <-setDT(df1, keep.rownames = TRUE)[]
df2 <- gather(df2, Video.ID, Rating, 2:4)
But it returns me:
> df2
rn m7 Video.ID Rating
1 c1 NA m1 1
2 c2 NA m1 3
3 c3 1 m1 NA
4 c1 NA m3 2
5 c2 NA m3 NA
6 c3 1 m3 NA
7 c1 NA m5 1
8 c2 NA m5 3
9 c3 1 m5 NA

While I am not certain why you are doing this (see @Jack Brookes's comment), you can do this pretty readily with tidyverse functions (from tibble, tidyr and dplyr):
df1 %>%
rownames_to_column('Customer.ID') %>%
gather(m1:m7, key = 'Movie.ID', value = 'Rating') %>%
filter(!is.na(Rating))
Which gives us:
Customer.ID Movie.ID Rating
1 c1 m1 1
2 c2 m1 3
3 c1 m3 2
4 c1 m5 1
5 c2 m5 3
6 c3 m7 1

Related

Calculating cumulative sum of columns with loop

I have a dataframe with gene expression data by lane (column). What I would like to do is write a loop that takes the sum of each row but progressively adds in another column each time. So each time I loop through I add another column to my dataframe that contains the sums of each row plus another column to the end of the dataframe. In the example below I did this using the apply() function by hand but this is very inefficient and not feasible for a large data set. I messed around with the cumsum() function but couldn't seem to get it to work for this. Very possible I missed something obvious but any guidance would be great!
#Example dataframe
c1 <- c('G1', 'G2', 'G3')
c2 <- c(5, 3, 1)
c3 <- c(3, 7, 1)
c4 <- c(6, 3, 4)
c5 <- c(6, 4, 3)
df <- data.frame(c1, c2, c3, c4, c5)
#Cal cumulative sums
sum.2.3 <- apply(df[,2:3],1,sum)
sum.2.4 <- apply(df[,2:4],1,sum)
sum.2.5 <- apply(df[,2:5],1,sum)
df <- cbind(df, sum.2.3, sum.2.4, sum.2.5)
If the problem is writing the loop, you can use apply inside it.
Code
start_col <- 2
end_col <- ncol(df)
for(i in (start_col+1):end_col){
var_name <- paste("sum",start_col,i,sep = ".")
df[,var_name] <- apply(df[,start_col:i],1,sum)
}
Output
c1 c2 c3 c4 c5 sum.2.3 sum.2.4 sum.2.5
1 G1 5 3 6 6 8 14 20
2 G2 3 7 3 4 10 13 17
3 G3 1 1 4 3 2 6 9
You can use Reduce()
Reduce(`+`, df[-1], accumulate = TRUE)[-1]
[[1]]
[1] 8 10 2
[[2]]
[1] 14 13 6
[[3]]
[1] 20 17 9
Assign into the data frame:
df[paste0("sum.2.", 3:5)] <- Reduce(`+`, df[-1], accumulate = TRUE)[-1]
Gives:
c1 c2 c3 c4 c5 sum.2.3 sum.2.4 sum.2.5
1 G1 5 3 6 6 8 14 20
2 G2 3 7 3 4 10 13 17
3 G3 1 1 4 3 2 6 9
No loop needed.
df <- data.frame(
c1 = c('G1', 'G2', 'G3'),
c2 = c(5, 3, 1),
c3 = c(3, 7, 1),
c4 = c(6, 3, 4),
c5 = c(6, 4, 3))
cbind(df, setNames(as.data.frame(t(apply(df[,-1], 1, cumsum))[,-1]), paste0("sum.2.", 3:5)))
#> c1 c2 c3 c4 c5 sum.2.3 sum.2.4 sum.2.5
#> 1 G1 5 3 6 6 8 14 20
#> 2 G2 3 7 3 4 10 13 17
#> 3 G3 1 1 4 3 2 6 9
Using rowCumsums from matrixStats
library(matrixStats)
df[paste0("sum.2.", 3:5)] <- rowCumsums(as.matrix(df[2:5]))[,-1]
-output
> df
c1 c2 c3 c4 c5 sum.2.3 sum.2.4 sum.2.5
1 G1 5 3 6 6 8 14 20
2 G2 3 7 3 4 10 13 17
3 G3 1 1 4 3 2 6 9
You can use both the mutate function from the dplyr package and the rowSums base function.
library(dplyr)
c1 <- c('G1', 'G2', 'G3')
c2 <- c(5, 3, 1)
c3 <- c(3, 7, 1)
c4 <- c(6, 3, 4)
c5 <- c(6, 4, 3)
df <- data.frame(c1, c2, c3, c4, c5)
df <- df %>%
dplyr::mutate(sum.2.3 = rowSums(across(c2:c3)),
sum.2.4 = rowSums(across(c2:c4)),
sum.2.5 = rowSums(across(c2:c5)))
Result
c1 c2 c3 c4 c5 sum.2.3 sum.2.4 sum.2.5
1 G1 5 3 6 6 8 14 20
2 G2 3 7 3 4 10 13 17
3 G3 1 1 4 3 2 6 9

How to write two vectors of different length into one data frame by writing same values into same row?

I want to write two vectors of different length with partly equal values into one data frame. The same values should be written in the same row.
ef1 <- c('A1', 'A2', 'B0', 'B1', 'C1', 'C2')
ef2 <- c('A1', 'A2', 'C1', 'C2', 'D1', 'D2')
If I write them in one data frame, it looks like this:
df <- data.frame (ef1, ef2)
> df
ef1 ef2
1 A1 A1
2 A2 A2
3 B0 C1
4 B1 C2
5 C1 D1
6 C2 D2
But what I want is this:
> df
ef1 ef2
1 A1 A1
2 A2 A2
3 B0 NA
4 B1 NA
5 C1 C1
6 C2 C2
7 NA D1
8 NA D2
I'm grateful for any help.
One option is match
(tmp <- unique(c(ef1, ef2)))
# [1] "A1" "A2" "B0" "B1" "C1" "C2" "D1" "D2"
out <- data.frame(ef1 = ef1[match(tmp, ef1)],
ef2 = ef2[match(tmp, ef2)])
Result
out
# ef1 ef2
#1 A1 A1
#2 A2 A2
#3 B0 <NA>
#4 B1 <NA>
#5 C1 C1
#6 C2 C2
#7 <NA> D1
#8 <NA> D2
Another solution, using dplyr's full_join. The idea is to artificially create a merging column and then make a full join.
ef1<-tibble(a=ef1,ef1=ef1)
ef2<-tibble(a=ef2,ef2=ef2)
ef1 %>%
full_join(ef2,by="a") %>%
select(ef1,ef2)
# A tibble: 8 x 2
ef1 ef2
<chr> <chr>
1 A1 A1
2 A2 A2
3 B0 NA
4 B1 NA
5 C1 C1
6 C2 C2
7 NA D1
8 NA D2

In R is there a way to recode the columns from one data frame with values from another data frame?

I am still relatively new to working in R and I am not sure how to approach this problem. Any help or advice is greatly appreciated!!!
The problem I have is that I am working with two data frames and I need to recode the first data frame with values from the second. The first data frame (df1) contains the data from the respondents to a survey and the other data frame (df2) is the data dictionary for df1.
The data looks like this:
df1 <- data.frame(a = c(1,2,3),
b = c(4,5,6),
c = c(7,8,9))
df2 <- data.frame(columnIndicator = c("a","a","a","b","b","b","c","c","c" ),
df1_value = c(1,2,3,4,5,6,7,8,9),
new_value = c("a1","a2","a3","b1","b2","b3","c1","c2","c3"))
So far I can manually recode df1 to get the expected output by doing this:
df1 <- within(df1,{
a[a==1] <- "a1"
a[a==2] <- "a2"
a[a==3] <- "a3"
b[b==4] <- "b4"
b[b==5] <- "b5"
b[b==6] <- "b6"
c[c==7] <- "c7"
c[c==8] <- "c8"
c[c==9] <- "c9"
})
However my real dataset has about 42 columns that need to be recoded and that method is a little time intensive. Is there another way in R for me to recode the values in df1 with the values in df2?
Thanks!
Just need to transform the shape a bit.
library(data.table)
df1 <- data.frame(a = c(1,2,3),
b = c(4,5,6),
c = c(7,8,9))
df2 <- data.frame(columnIndicator = c("a","a","a","b","b","b","c","c","c" ),
df1_value = c(1,2,3,4,5,6,7,8,9),
new_value = c("a1","a2","a3","b4","b5","b6","c7","c8","c9"),stringsAsFactors = FALSE)
setDT(df1)
setDT(df2)
df1[,ID:=.I]
ldf1 <- melt(df1,measure.vars = c("a","b","c"),variable.name = "columnIndicator",value.name = "df1_value")
ldf1[df2,"new_value":=i.new_value,on=.(columnIndicator,df1_value)]
ldf1
#> ID columnIndicator df1_value new_value
#> 1: 1 a 1 a1
#> 2: 2 a 2 a2
#> 3: 3 a 3 a3
#> 4: 1 b 4 b4
#> 5: 2 b 5 b5
#> 6: 3 b 6 b6
#> 7: 1 c 7 c7
#> 8: 2 c 8 c8
#> 9: 3 c 9 c9
dcast(ldf1,ID~columnIndicator,value.var = "new_value")
#> ID a b c
#> 1: 1 a1 b4 c7
#> 2: 2 a2 b5 c8
#> 3: 3 a3 b6 c9
Created on 2020-04-18 by the reprex package (v0.3.0)
In base R, we can unlist df1, match it against df1_value, and get the corresponding new_value.
df1[] <- df2$new_value[match(unlist(df1), df2$df1_value)]
df1
# a b c
#1 a1 b1 c1
#2 a2 b2 c2
#3 a3 b3 c3
Is this what you are looking for???
library(dplyr)
df3 <- df1 %>% gather(key = "key", value = "value")
df3 %>% inner_join(df2, by = c("key" = "columnIndicator", "value" = "df1_value"))
Output
key value new_value
1 a 1 a1
2 a 2 a2
3 a 3 a3
4 b 4 b1
5 b 5 b2
6 b 6 b3
7 c 7 c1
8 c 8 c2
9 c 9 c3

Melt a matrix using an external covariate in R

I have a matrix:
mat1 <- matrix(rnorm(8), ncol = 4;
,dimnames=list(c('R1','R2'),c('C1','C2','C3','C4')))
> mat1
C1 C2 C3 C4
R1 1.226139 -1.0604743 -0.1803689 0.3852505
R2 -1.232622 -0.5567295 -0.4146919 0.2433812
and a covariate that match names of the matrix columns
> covariate <- factor(c('A','A','B','B'))
> t(data.frame(covariate, colnames(mat1)))
[,1] [,2] [,3] [,4]
covariate "A" "A" "B" "B"
colnames.mat1. "C1" "C2" "C3" "C4"
I would like to melt it with respect to the covariate, so that each melted row also carries the covariate level of its column.
Melting the data gives:
> melt( mat1 )
Var1 Var2 value
1 R1 C1 1.2261395
2 R2 C1 -1.2326215
3 R1 C2 -1.0604743
4 R2 C2 -0.5567295
5 R1 C3 -0.1803689
6 R2 C3 -0.4146919
7 R1 C4 0.3852505
8 R2 C4 0.2433812
However I would like to get the following result:
covariate_2 <- factor( c(rep('A',4) , rep('B',4) ))
> data.frame( covariate_2 , melted_data )
covariate_2 Var1 Var2 value
1 A R1 C1 1.2261395
2 A R2 C1 -1.2326215
3 A R1 C2 -1.0604743
4 A R2 C2 -0.5567295
5 B R1 C3 -0.1803689
6 B R2 C3 -0.4146919
7 B R1 C4 0.3852505
8 B R2 C4 0.2433812
I think there must be a way to get the results using standard melt function. I would appreciate any help.
Perhaps it's easiest to just rename the columns of your matrix first, and then melt.
Here are a couple of examples, first using "data.table", and second using the "tidyverse":
library(data.table)
setDT(melt(`colnames<-`(mat1, paste(c('A','A','B','B'), colnames(mat1), sep = "_"))))[
, c("cov", "V1") := tstrsplit(Var2, "_")][, Var2 := NULL][]
# Var1 value cov V1
# 1: R1 1.2261390 A C1
# 2: R2 -1.2326220 A C1
# 3: R1 -1.0604743 A C2
# 4: R2 -0.5567295 A C2
# 5: R1 -0.1803689 B C3
# 6: R2 -0.4146919 B C3
# 7: R1 0.3852505 B C4
# 8: R2 0.2433812 B C4
library(tidyverse)
`colnames<-`(mat1, paste(c('A','A','B','B'), colnames(mat1), sep = "_")) %>%
as.data.frame() %>%
rownames_to_column() %>%
gather(var, val, -rowname) %>%
separate(var, into = c("cov", "var1"))
# rowname cov var1 val
# 1 R1 A C1 1.2261390
# 2 R2 A C1 -1.2326220
# 3 R1 A C2 -1.0604743
# 4 R2 A C2 -0.5567295
# 5 R1 B C3 -0.1803689
# 6 R2 B C3 -0.4146919
# 7 R1 B C4 0.3852505
# 8 R2 B C4 0.2433812
Sample data:
mat1 <- structure(c(1.226139, -1.232622, -1.0604743, -0.5567295, -0.1803689,
-0.4146919, 0.3852505, 0.2433812), .Dim = c(2L, 4L), .Dimnames = list(
c("R1", "R2"), c("C1", "C2", "C3", "C4")))

R Data wrangling

I am new to R, I have a csv file that contains values:
A, , ,
,B, ,
, ,C1,
, , ,D1
, , ,D2
, ,C2,
, , ,D3
, , ,D4
Loading the data into a data frame:
dat = read.csv("~/RData/test.csv", header = FALSE)
dat
# V1 V2 V3 V4
# 1 A
# 2 B
# 3 C1
# 4 D1
# 5 D2
# 6 C2
# 7 D3
# 8 D4
I need to wrangle this to a data frame format:
A,B,C1,D1
A,B,C1,D2
A,B,C2,D3
A,B,C2,D4
Thanks in advance!
A solution using dplyr and tidyr. This solution follows the link in Gregor's comments, but instead of using the zoo package, here I show the use of the fill function from tidyr, na.omit from base R, and the distinct function from dplyr.
library(dplyr)
library(tidyr)
dt2 <- dt %>%
fill(everything(), .direction = "down") %>%
na.omit() %>%
distinct(V4, .keep_all = TRUE)
dt2
V1 V2 V3 V4
1 A B C1 D1
2 A B C1 D2
3 A B C2 D3
4 A B C2 D4
DATA
dt <- read.table(text = "V1 V2 V3 V4
1 A NA NA NA
2 NA B NA NA
3 NA NA C1 NA
4 NA NA NA D1
5 NA NA NA D2
6 NA NA C2 NA
7 NA NA NA D3
8 NA NA NA D4",
header = TRUE, stringsAsFactors = FALSE)
By using zoo
library(zoo)
df[df==' '] <- NA
df[1:3] <- lapply(df[1:3], na.locf0, fromLast = FALSE)
df <- df[!is.na(df$V4),]
df
giving:
V1 V2 V3 V4
4 A B C1 D1
5 A B C1 D2
7 A B C2 D3
8 A B C2 D4
Alternatively, using magrittr as well, we can write the above code as this pipeline:
library(magrittr)
library(zoo)
df %>%
replace(. == ' ', NA) %>%
replace(1:3, lapply(.[1:3], na.locf0, fromLast = FALSE)) %>%
subset(!is.na(V4))

Resources