Melt 4 columns into 3 while combining the two variable columns - r

The following sample data.frame:
Date <- seq(as.Date("2016/9/1"), as.Date("2016/9/10"), "days")
A <- sample(0:200, 10)
B <- sample(0:400, 10)
A_limit <- rep(200,10)
B_limit <- rep(400,10)
data_sample <- data.frame(Date,A,B,A_limit,B_limit)
> Date A B A_limit B_limit
1 2016-09-01 175 270 200 400
2 2016-09-02 160 50 200 400
3 2016-09-03 173 25 200 400
...
and I would like to reshape it into the form:
> Date limit variable value
1 2016-09-01 200 A 175
2 2016-09-02 200 A 160
3 2016-09-03 200 A 173
...
31 2016-09-01 400 B 270
32 2016-09-02 400 B 50
33 2016-09-03 400 B 25
....
I managed to get it done, but it seems to me my way is far too complicated:
library("reshape2")
data_sample_2 <- data_sample %>% melt(id=c("Date","A","B"))
levels(data_sample_2$variable) <- c(levels(data_sample_2$variable),"A","B")
data_sample_2$variable[data_sample_2$variable == "A_limit"] <- as.factor("A")
data_sample_2$variable[data_sample_2$variable == "B_limit"] <- as.factor("B")
names(data_sample_2)[names(data_sample_2) == "value"] <- "limit"
names(data_sample_2)[names(data_sample_2) == "variable"] <- "variable_1"
data_sample_3 <- data_sample_2 %>% melt(id=c("Date","variable_1","limit"))
data_sample_3 <- droplevels(data_sample_3)
data_sample_4 <- data_sample_3[data_sample_3$variable_1 == data_sample_3$variable,]
data_sample_4$variable_1 <- NULL
I just started using the reshape2 package, so please let me know about any way I can improve this data.frame transformation (no matter how obvious it may seem).

You can do this via base R simply by stacking everything, i.e.
df1 <- data.frame(Date = data_sample$Date, limit = stack(data_sample[-(1:3)])[[1]],
variable = stack(data_sample[2:3])[[2]],
value = stack(data_sample[2:3])[[1]],
stringsAsFactors = FALSE)
head(df1)
# Date limit variable value
#1 2016-09-01 200 A 67
#2 2016-09-02 200 A 100
#3 2016-09-03 200 A 166
#4 2016-09-04 200 A 116
#5 2016-09-05 200 A 89
#6 2016-09-06 200 A 138
tail(df1)
# Date limit variable value
#15 2016-09-05 400 B 208
#16 2016-09-06 400 B 387
#17 2016-09-07 400 B 125
#18 2016-09-08 400 B 116
#19 2016-09-09 400 B 120
#20 2016-09-10 400 B 241

Is this what you want?
data_sample_2 <- melt(data_sample,id.vars=c("Date","A_limit","B_limit"))
data_sample_2$limit<- ifelse(data_sample_2$variable=="A",data_sample_2$A_limit,data_sample_2$B_limit)
data_sample_2[,c("Date","limit","variable","value")]

Since you used reshape2 in your example, it might interest you to see how to handle it in the (more updated) tidyverse setup.
I'll repeat your generation code:
Date <- seq(as.Date("2016/9/1"), as.Date("2016/9/10"), "days")
A <- sample(0:200, 10)
B <- sample(0:400, 10)
A_limit <- rep(200,10)
B_limit <- rep(400,10)
data_sample <- data.frame(Date,A,B,A_limit,B_limit)
# Preview
head(data_sample)
#> Date A B A_limit B_limit
#> 1 2016-09-01 39 53 200 400
#> 2 2016-09-02 96 193 200 400
#> 3 2016-09-03 143 75 200 400
#> 4 2016-09-04 60 241 200 400
#> 5 2016-09-05 126 225 200 400
#> 6 2016-09-06 184 349 200 400
Now we can use dplyr and tidyr (which take on much of the responsibilities that reshape2 has) to manipulate the data in a "clear" way.
library(dplyr)
library(tidyr)
# Reshape A/B into long form and attach the limit belonging to each variable.
# pivot_longer() replaces the superseded gather(). It emits rows in input-row
# order (A, B, A, B, ...), so arrange() restores the "all A rows, then all B
# rows" layout, and as.data.frame() keeps the plain data.frame printing.
data_clean <- data_sample %>%
  pivot_longer(c(A, B), names_to = "variable", values_to = "value") %>%
  mutate(limit = if_else(variable == "A", A_limit, B_limit)) %>%
  select(Date, limit, variable, value) %>%
  arrange(variable, Date) %>%
  as.data.frame()
# Inspect results
head(data_clean)
#> Date limit variable value
#> 1 2016-09-01 200 A 39
#> 2 2016-09-02 200 A 96
#> 3 2016-09-03 200 A 143
#> 4 2016-09-04 200 A 60
#> 5 2016-09-05 200 A 126
#> 6 2016-09-06 200 A 184

Related

Combine dataframes only by mutual rownames

I want to combine about 20 dataframes, with different lengths of rows and columns, only by the mutual rownames. Any rows that are not shared for ALL dataframes are deleted. So for example on two dataframes:
Patient1 Patient64 Patient472
ABC 28 38 0
XYZ 92 11 998
WWE 1 10 282
ICQ 0 76 56
SQL 22 1002 778
combine with
Pat_9 Pat_1 Pat_111
ABC 65 44 874
CBA 3 311 998
WWE 2 1110 282
vVv 2 760 56
GHG 12 1200 778
The result would be
Patient1 Patient64 Patient472 Pat_9 Pat_1 Pat_111
ABC 28 38 0 65 44 874
WWE 1 10 282 2 1110 282
I know how to use rbind and cbind but not for the purpose of joining according to shared rownames.
Try this, replacing the list arguments with your own data.frames (df1, df2, df3, ..., df20):
l <- lapply(list(df1 , df2 ) , \(x) {x[["id"]] <- rownames(x) ; x})
Reduce(\(x,y) merge(x,y , by = "id") , l)
you can try
merge(d1, d2, by = "row.names")
Row.names Patient1 Patient64 Patient472 Pat_9 Pat_1 Pat_111
1 ABC 28 38 0 65 44 874
2 WWE 1 10 282 2 1110 282
For more than two data frames you can use the tidyverse:
library(tidyverse)
lst(d1, d2, d2) %>%
map(rownames_to_column) %>%
reduce(inner_join, by="rowname")
You can first turn the row names into a column with rownames_to_column, then use an inner_join, and finally convert the column back to row names with column_to_rownames, like this:
df1 <- read.table(text=" Patient1 Patient64 Patient472
ABC 28 38 0
XYZ 92 11 998
WWE 1 10 282
ICQ 0 76 56
SQL 22 1002 778", header = TRUE)
df2 <- read.table(text = " Pat_9 Pat_1 Pat_111
ABC 65 44 874
CBA 3 311 998
WWE 2 1110 282
vVv 2 760 56
GHG 12 1200 778", header = TRUE)
library(dplyr)
library(tibble)
df1 %>%
rownames_to_column() %>%
inner_join(df2 %>% rownames_to_column(), by = "rowname") %>%
column_to_rownames()
#> Patient1 Patient64 Patient472 Pat_9 Pat_1 Pat_111
#> ABC 28 38 0 65 44 874
#> WWE 1 10 282 2 1110 282
Created on 2022-07-20 by the reprex package (v2.0.1)
Option with list of dataframes:
# Inner-join every data frame in the list on its row names.
dfs_list <- list(df1, df2)
# Copy the row names into a helper column `rn` so merge() can join on them.
with_rn <- lapply(dfs_list, function(x) data.frame(x, rn = row.names(x)))
# Join explicitly on `rn`: merge()'s default joins on *all* shared column
# names, which would silently change the result if two data frames happened
# to share a column name.
merged <- Reduce(function(x, y) merge(x, y, by = "rn"), with_rn)
# Restore `rn` as the row names and drop the helper column.
transform(merged, row.names = rn, rn = NULL)
#> Patient1 Patient64 Patient472 Pat_9 Pat_1 Pat_111
#> ABC 28 38 0 65 44 874
#> WWE 1 10 282 2 1110 282
Created on 2022-07-20 by the reprex package (v2.0.1)

Pivot/Reshape data in R [duplicate]

This question already has answers here:
Reshape horizontal to to long format using pivot_longer
(3 answers)
Closed 2 years ago.
Thank you all for your answers. I thought I was smarter than I am and hoped I would have understood some of it. I think I messed up the presentation of my data as well. I have edited my post to better show my sample data. Sorry for the inconvenience, and I truly hope that someone can help me.
I have a question about reshaping my data. The data collected looks as such:
# Sample data in wide format: a pid column followed by four
# (measurementN, TdaysN) column pairs. Blank cells are coded as NA.
data <- read.table(header = TRUE, text = '
pid measurement1 Tdays1 measurement2 Tdays2 measurement3 Tdays3 measurement4 Tdays4
1 1356 1435 1483 1405 1563 1374 NA NA
2 943 1848 1173 1818 1300 1785 NA NA
3 1590 185 NA NA NA NA 1585 294
4 130 72 443 70 NA NA 136 79
4 140 82 NA NA NA NA 756 89
4 220 126 266 124 NA NA 703 128
4 166 159 213 156 476 145 776 166
4 380 189 583 173 NA NA 586 203
4 353 231 510 222 656 217 526 240
4 180 268 NA NA NA NA NA NA
4 NA NA NA NA NA NA 580 278
4 571 334 596 303 816 289 483 371
')
Now i would like it to look something like this:
PID Time Value
1 1435 1356
1 1405 1483
1 1374 1563
2 1848 943
2 1818 1173
2 1785 1300
3 185 1590
... ... ...
How would I get there? I have looked up some things about wide-to-long format, but they don't seem to do the trick. I am relatively new to RStudio and Stack Overflow (if you couldn't tell that already).
Kind regards, and thank you in advance.
Here is a slightly different pivot_longer() version.
library(tidyr)
library(dplyr)
dw %>%
pivot_longer(cols = -PID, names_to =".value", names_pattern = "(.+)[0-9]")
# A tibble: 9 x 3
PID T measurement
<dbl> <dbl> <dbl>
1 1 1 100
2 1 4 200
3 1 7 50
4 2 2 150
5 2 5 300
6 2 8 60
7 3 3 120
8 3 6 210
9 3 9 70
The names_to = ".value" argument creates new columns from column names based on the names_pattern argument. The names_pattern argument takes a special regex input. In this case, here is the breakdown:
(.+) # match everything - whatever this capture group matches becomes the ".value" column name
[0-9] # a single digit - tells the pattern that the number
# at the end is excluded from ".value". If the suffix can have several
# digits, use "(\\D+)[0-9]+" instead ("[0-9*]" would match one digit or a literal "*")
In the last edit you asked for a solution that is easy to understand. A very simple approach would be to stack the measurement columns on top of each other and the Tdays columns on top of each other. Although specialty packages make things very concise and elegant, for simplicity we can solve this without additional packages. Standard R has a convenient function aptly named stack, which works like this:
> exp <- data.frame(value1 = 1:5, value2 = 6:10)
> stack(exp)
values ind
1 1 value1
2 2 value1
3 3 value1
4 4 value1
5 5 value1
6 6 value2
7 7 value2
8 8 value2
9 9 value2
10 10 value2
We can stack measurements and Tdays separately and then combine them via cbind:
# Read the wide sample data: a pid column followed by four
# (measurementN, TdaysN) column pairs.
data <- read.table(header = TRUE, text = '
pid measurement1 Tdays1 measurement2 Tdays2 measurement3 Tdays3 measurement4 Tdays4
1 1356 1435 1483 1405 1563 1374 NA NA
2 943 1848 1173 1818 1300 1785 NA NA
3 1590 185 NA NA NA NA 1585 294
4 130 72 443 70 NA NA 136 79
4 140 82 NA NA NA NA 756 89
4 220 126 266 124 NA NA 703 128
4 166 159 213 156 476 145 776 166
4 380 189 583 173 NA NA 586 203
4 353 231 510 222 656 217 526 240
4 180 268 NA NA NA NA NA NA
4 NA NA NA NA NA NA 580 278
4 571 334 596 303 816 289 483 371
')
# Stack the four measurement columns and the four Tdays columns separately;
# both stacks keep the same row order, so cbind() lines each measurement up
# with its Tdays value.
cbind(
  stack(data, c(measurement1, measurement2, measurement3, measurement4)),
  stack(data, c(Tdays1, Tdays2, Tdays3, Tdays4))
)
Which keeps measurements and Tdays neatly together but leaves us without pid which we can add using rep to replicate the original pid 4 times:
result <- cbind(pid = rep(data$pid, 4),
stack(data, c(measurement1, measurement2, measurement3, measurement4)),
stack(data, c(Tdays1, Tdays2, Tdays3, Tdays4)))
The head of which looks like
> head(result)
pid values ind values ind
1 1 1356 measurement1 1435 Tdays1
2 2 943 measurement1 1848 Tdays1
3 3 1590 measurement1 185 Tdays1
4 4 130 measurement1 72 Tdays1
5 4 140 measurement1 82 Tdays1
6 4 220 measurement1 126 Tdays1
As I said above, this is not the order you expected and you can try to sort this data.frame, if that is of any concern:
result <- result[order(result$pid), c(1, 4, 2)]
names(result) <- c("pid", "Time", "Value")
leading to the final result
> head(result)
pid Time Value
1 1 1435 1356
13 1 1405 1483
25 1 1374 1563
37 1 NA NA
2 2 1848 943
14 2 1818 1173
tidyverse solution
library(tidyverse)
# Go fully long, split each column name into stem and index, then widen
# again so that every stem becomes its own column.
dw %>%
  pivot_longer(-PID) %>%
  # insert "_" between the letters and the trailing digits, e.g. "T1" -> "T_1"
  mutate(name = gsub("^([A-Za-z]+)(\\d+)$", "\\1_\\2", name)) %>%
  # convert = TRUE lets separate() turn the index column B into an integer
  separate(name, into = c("A", "B"), sep = "_", convert = TRUE) %>%
  pivot_wider(names_from = A, values_from = value)
Gives the following output
# A tibble: 9 x 4
PID B T measurement
<int> <int> <int> <int>
1 1 1 1 100
2 1 2 4 200
3 1 3 7 50
4 2 1 2 150
5 2 2 5 300
6 2 3 8 60
7 3 1 3 120
8 3 2 6 210
9 3 3 9 70
Considering a dataframe, df like the following:
PID T1 measurement1 T2 measurement2 T3 measurement3
1 1 100 4 200 7 50
2 2 150 5 300 8 60
3 3 120 6 210 9 70
You can use this solution to get your required dataframe:
# Column 1 is the id; the remaining columns must come in (time, measurement)
# pairs, so the total number of columns has to be odd and at least 3.
stopifnot(ncol(df) >= 3, (ncol(df) - 1) %% 2 == 0)
# First column index of every (time, measurement) pair: 2, 4, 6, ...
pair_starts <- seq(from = 2, to = ncol(df) - 1, by = 2)
# Cut out id + one pair at a time. Every piece gets the column names of the
# first pair, because rbind() on data frames matches columns by name and
# stops with "names do not match previous names" otherwise.
pieces <- lapply(pair_starts, function(j) {
  piece <- df[, c(1, j, j + 1)]
  colnames(piece) <- colnames(df)[1:3]
  piece
})
# Bind all pieces in one call instead of growing the result inside a loop.
finalDf <- do.call(rbind, pieces)
# Sort by id so that the rows of each id end up next to each other.
finalDf <- finalDf[order(finalDf[, 1]), ]
print(finalDf)
The output of the print statement is this:
PID T1 measurement1
1 1 1 100
4 1 4 200
7 1 7 50
2 2 2 150
5 2 5 300
8 2 8 60
3 3 3 120
6 3 6 210
9 3 9 70
Maybe you can try reshape like below
reshape(
setNames(data, gsub("(\\d+)$", "\\.\\1", names(data))),
direction = "long",
varying = 2:ncol(data)
)

How to sum the change that happens during a specific date range?

df <- data.frame("Date"=seq(as.Date("2020/1/1"),by="day", length.out = 20),events=sample(0:100,20))
trying to sum the closest
df <- df %>% mutate(seven_sum=sum(events[Date <= Date & Date > Date-7]) )
I want to sum everything that has happened during the last 7 days. I can understand why this is not working, but not really how to solve it.
So basically I would like to sum, for each row, that date and all others within the previous 7 days. It is easy to fix if I use a fixed date range, but I would like the range to change for each row...
Any advice on how to continue would be very helpful.
Using purrr::map_int :
library(dplyr)
library(purrr)
df %>% mutate(seven_sum=map_int(Date, ~sum(events[Date <= .x & Date > (.x-7)])))
# Date events seven_sum
#1 2020-01-01 66 66
#2 2020-01-02 94 160
#3 2020-01-03 49 209
#4 2020-01-04 39 248
#5 2020-01-05 84 332
#6 2020-01-06 29 361
#7 2020-01-07 36 397
#8 2020-01-08 20 351
#9 2020-01-09 40 297
#10 2020-01-10 25 273
#11 2020-01-11 3 237
#12 2020-01-12 97 250
#13 2020-01-13 22 243
#14 2020-01-14 63 270
#15 2020-01-15 58 308
#16 2020-01-16 91 359
#17 2020-01-17 26 360
#18 2020-01-18 47 404
#19 2020-01-19 35 342
#20 2020-01-20 38 358
and same logic in base R :
# For every date, sum the events that fall in the 7-day window ending on it.
# vapply() (unlike sapply()) always returns a numeric vector, even when df
# has zero rows.
vapply(
  df$Date,
  function(x) sum(df$events[df$Date <= x & df$Date > (x - 7)]),
  numeric(1)
)
We can use data.table methods to do a non-equi join which would be more efficient
library(data.table)
v1 <- setDT(df)[df[, Date1 := Date - 7], sum(events),
on = .(Date <= Date, Date > Date1), allow.cartesian =TRUE, by = .EACHI]$V1
df[, seven_sum := v1][]

Checking the value from given threshold in a set of observation and continue till end of vector

Task:
I have to check whether the values in the data vector are above a given threshold.
If I find at least 5 consecutive values greater than the given threshold in my data vector, then I keep these values as they are.
If I have fewer than 5 such values in a row (not 5 consecutive values), then I replace these values with NAs.
The sample data and required output are shown below. In this example the threshold value is 1000. X is the input data variable and Y is the desired output (X with the short runs above the threshold of 1000 replaced by NA):
X Y
580 580
457 457
980 980
1250 NA
3600 NA
598 598
1200 1200
1345 1345
9658 9658
1253 1253
4500 4500
1150 1150
596 596
594 594
550 550
1450 NA
320 320
1780 NA
592 592
590 590
I have used the following code in R for my desired output but unable to get the appropriate one:
for (i in 1:nrow(X)) # X is my data vector
{counter=0
if (X[i]>10000)
{
for (j in i:(i+4))
{
if (X[j]>10000)
{counter=counter+1}
}
ifelse (counter < 5, NA, X[j])
}
X[i]<- NA
}
X
I am sure that the above code contains some errors. I need help in the form of either new code, a modification of this code, or an R package.
Here is an approach using dplyr, using a cumulative sum of diff(x > 1000) to group the values.
library(dplyr)
df <- data.frame(x)
df
# x
# 1 580
# 2 457
# 3 980
# 4 1250
# 5 3600
# 6 598
# 7 1200
# 8 1345
# 9 9658
# 10 1253
# 11 4500
# 12 1150
# 13 596
# 14 594
# 15 550
# 16 1450
# 17 320
# 18 1780
# 19 592
# 20 590
df %>%
  # a new group starts every time x crosses the threshold of 1000
  mutate(group = cumsum(c(0, abs(diff(x > 1000))))) %>%
  group_by(group) %>%
  # count = length of the run the value belongs to
  mutate(count = n()) %>%
  ungroup() %>%
  # keep values that are not above the threshold, and above-threshold runs of
  # at least 5 consecutive values; everything else becomes NA
  mutate(y = ifelse(x <= 1000 | count >= 5, x, NA))
# x group count y
# (int) (dbl) (int) (int)
# 1 580 0 3 580
# 2 457 0 3 457
# 3 980 0 3 980
# 4 1250 1 2 NA
# 5 3600 1 2 NA
# 6 598 2 1 598
# 7 1200 3 6 1200
# 8 1345 3 6 1345
# 9 9658 3 6 9658
# 10 1253 3 6 1253
# 11 4500 3 6 4500
# 12 1150 3 6 1150
# 13 596 4 3 596
# 14 594 4 3 594
# 15 550 4 3 550
# 16 1450 5 1 NA
# 17 320 6 1 320
# 18 1780 7 1 NA
# 19 592 8 2 592
# 20 590 8 2 590
Another approach :
# Start with all NA; a value is filled in only when the criteria are met.
n_obs <- nrow(X)
Y <- rep(NA, n_obs)
for (i in seq_len(n_obs)) {
  # the (up to) 5 positions starting at i, clipped at the end of the data
  window <- i:min(i + 4, n_obs)
  if (X[i, 1] <= 1000) {
    # not above the threshold: keep the value as it is
    Y[i] <- X[i, 1]
  } else if (sum(X[window, 1] > 1000) >= 5) {
    # i starts 5 consecutive values above the threshold: keep all five
    Y[window] <- X[window, 1]
  }
}
returns
> Y
[1] 580 457 980 NA NA 598 1200 1345 9658 1253 4500 1150 596 594 550 NA 320 NA 592 590
This assumes that the values of X are in the first column of a dataframe named X.
It then creates Y filled with NA and only changes the values if the criteria are met.

Summarizing a data frame

I am trying to take the following data and use it to create a table which has the information broken down by state.
Here's the data:
> head(mydf2, 10)
lead_id buyer_account_id amount state
1 52055267 62 300 CA
2 52055267 64 264 CA
3 52055305 64 152 CA
4 52057682 62 75 NJ
5 52060519 62 750 OR
6 52060519 64 574 OR
15 52065951 64 152 TN
17 52066749 62 600 CO
18 52062751 64 167 OR
20 52071186 64 925 MN
I've already subset the data to the states that I'm interested in:
mydf2 = subset(mydf, state %in% c("NV","AL","OR","CO","TN","SC","MN","NJ","KY","CA"))
Here's an idea of what I'm looking for:
State Amount Count
NV 1 50
NV 2 35
NV 3 20
NV 4 15
AL 1 10
AL 2 6
AL 3 4
AL 4 1
...
For each state, I'm trying to find a count for each amount "level." I don't necessarily need to group the amount variable, but keep in mind that the amounts are not just 1, 2, 3, etc.:
> mydf$amount
[1] 300 264 152 75 750 574 113 152 750 152 675 489 188 263 152 152 600 167 34 925 375 156 675 152 488 204 152 152
[29] 600 489 488 75 152 152 489 222 563 215 452 152 152 75 100 113 152 150 152 150 152 452 150 152 152 225 600 620
[57] 113 152 150 152 152 152 152 152 152 152 640 236 152 480 152 152 200 152 560 152 240 222 152 152 120 257 152 400
Is there an elegant solution for this in R, or will I be stuck using Excel (yuck!)?
Here's my understanding of what you're trying to do:
Start with a simple data.frame with 26 states and amounts only ranging from 1 to 50 (which is much more restrictive than what you have in your example, where the range is much higher).
set.seed(1)
mydf <- data.frame(
state = sample(letters, 500, replace = TRUE),
amount = sample(1:50, 500, replace = TRUE)
)
head(mydf)
# state amount
# 1 g 28
# 2 j 35
# 3 o 33
# 4 x 34
# 5 f 24
# 6 x 49
Here's some straightforward tabulation. I've also removed any instances where frequency equals zero, and I've reordered the output by state.
temp1 <- data.frame(table(mydf$state, mydf$amount))
temp1 <- temp1[!temp1$Freq == 0, ]
head(temp1[order(temp1$Var1), ])
# Var1 Var2 Freq
# 79 a 4 1
# 157 a 7 2
# 391 a 16 1
# 417 a 17 1
# 521 a 21 1
# 1041 a 41 1
dim(temp1) # How many rows/cols
# [1] 410 3
Here's a little bit different tabulation. We are tabulating after grouping the "amount" values. Here, I've manually specified the breaks, but you could just as easily let R decide what it thinks is best.
temp2 <- data.frame(table(mydf$state,
cut(mydf$amount,
breaks = c(0, 12.5, 25, 37.5, 50),
include.lowest = TRUE)))
temp2 <- temp2[!temp2$Freq == 0, ]
head(temp2[order(temp2$Var1), ])
# Var1 Var2 Freq
# 1 a [0,12.5] 3
# 27 a (12.5,25] 3
# 79 a (37.5,50] 3
# 2 b [0,12.5] 2
# 28 b (12.5,25] 6
# 54 b (25,37.5] 5
dim(temp2)
# [1] 103 3
I am not sure if I understand correctly (you have two data.frames mydf and mydf2). I'll assume your data is in mydf. Using aggregate:
# Helper column: its values are irrelevant, aggregate() only counts how many
# entries fall into each amount/state combination.
mydf$count <- seq_len(nrow(mydf))
aggregate(count ~ amount + state, data = mydf, FUN = length)
Is this what you are looking for?
Note: here count is a variable that is created just to get directly the output of the 3rd column as count.
Alternatives with ddply from plyr:
# no need to create a variable called count
ddply(mydf, .(state, amount), summarise, count=length(lead_id))
Here one could use any column that exists in one's data instead of lead_id. Even state:
ddply(mydf, .(state, amount), summarise, count=length(state))
Or equivalently without using summarise:
ddply(mydf, .(state, amount), function(x) c(count=nrow(x)))

Resources