Re-align column in data frame into multiple columns - r

I'm trying to change a data frame column (var3, in the example below) that has multiple values for each factor level of another variable (name, in the example below). I'd like var3 to be split into separate columns, one for each value, so that the factor levels in name do not repeat. My other variables (var1, var2) currently repeat where necessary to make room for the multiple var3 values.
This is the kind of data I have:
df1 <- structure(list(name = structure(c(2L, 4L, 4L, 4L, 3L, 5L, 5L,
1L), .Label = c("fifth", "first", "fourth", "second", "third"
), class = "factor"), var1 = c(90L, 84L, 84L, 84L, 18L, 22L,
22L, 36L), var2 = c(301L, 336L, 336L, 336L, 412L, 296L, 296L,
357L), var3 = c(-0.582075925, -1.108889624, -1.014962009, -0.162309524,
-0.282309524, 0.563055819, -0.232075925, -0.773353424)), .Names = c("name",
"var1", "var2", "var3"), class = "data.frame", row.names = c(NA, -8L))
This is what I'd like:
df2 <- structure(list(name = structure(c(2L, 4L, 3L, 5L, 1L), .Label = c("fifth",
"first", "fourth", "second", "third"), class = "factor"), var1 = c(90L,
84L, 18L, 22L, 36L), var2 = c(301L, 336L, 412L, 296L, 357L),
var3 = c(-0.582075925, -1.108889624, -0.282309524, 0.563055819,
-0.773353424), var3.2 = c(NA, -1.014962009, NA, -0.232075925,
NA), var3.3 = c(NA, -0.162309524, NA, NA, NA)), .Names = c("name", "var1",
"var2", "var3", "var3.2", "var3.3"), class = "data.frame", row.names = c(NA, -5L))
I've looked at reshape and ddply, but can't get them to give me this output.

Here's a base solution:
> df1$seqnam <- ave(as.character(df1$name), df1$name, FUN=seq) # creates a "time" index
> reshape(df1, direction="wide", timevar="seqnam", idvar=c("name", "var1", "var2") )
name var1 var2 var3.1 var3.2 var3.3
1 first 90 301 -0.5820759 NA NA
2 second 84 336 -1.1088896 -1.0149620 -0.1623095
5 fourth 18 412 -0.2823095 NA NA
6 third 22 296 0.5630558 -0.2320759 NA
8 fifth 36 357 -0.7733534 NA NA

ddply(df1, .(name), function(x) {
var3 <- data.frame(rbind(unique(x$var3)))
names(var3) <- paste0("var3.", 1:length(var3))
return(data.frame(name = unique(x$name), var1 = unique(x$var1),
var2 = unique(x$var2), var3))
})
name var1 var2 var3.1 var3.2 var3.3
1 fifth 36 357 -0.7733534 NA NA
2 first 90 301 -0.5820759 NA NA
3 fourth 18 412 -0.2823095 NA NA
4 second 84 336 -1.1088896 -1.0149620 -0.1623095
5 third 22 296 0.5630558 -0.2320759 NA
The function can be modified if you expect var1 and var2 to also contain multiple values.

Related

Replace all values in dataframe using another dataframe as key in R

I have two dataframes and I want to replace all values (in all the columns) of df1 with the equivalent value in df2 (df2$value).
df1
structure(list(Cell_ID = c(7L, 2L, 3L, 10L), n_1 = c(0L, 0L,
0L, 0L), n_2 = c(9L, 1L, 4L, 1L), n_3 = c(10L, 4L, 5L, 2L), n_4 = c(NA,
5L, NA, 4L), n_5 = c(NA, 7L, NA, 6L), n_6 = c(NA, 9L, NA, 8L),
n_7 = c(NA, 10L, NA, 3L)), class = "data.frame", row.names = c(NA,
-4L))
df2
structure(list(Cell_ID = 0:10, value = c(5L, 100L, 200L, 300L,
400L, 500L, 600L, 700L, 800L, 900L, 1000L)), class = "data.frame", row.names = c(NA,
-11L))
The desired output would look like this:
So far I have tried this, as suggested in another similar post, but it's not working properly (it randomly misses some values):
key= df2$Cell_ID
value = df2$value
lapply(1:8,FUN = function(i){df1[df1 == key[i]] <<- value[i]})
Note that the numbers have just been multiplied by 10 for ease in the example; in the real data the numbers are all over the place, so just multiplying the dataframe by 10 won't work.
An option is to match the elements with the 'Cell_ID' of the second dataset and use that as an index to return the corresponding 'value' from 'df2':
library(dplyr)
df1 %>%
mutate(across(everything(), ~ df2$value[match(., df2$Cell_ID)]))
-output
# Cell_ID n_1 n_2 n_3 n_4 n_5 n_6 n_7
#1 700 5 900 1000 NA NA NA NA
#2 200 5 100 400 500 700 900 1000
#3 300 5 400 500 NA NA NA NA
#4 1000 5 100 200 400 600 800 300
Or another option is to use a named vector to do the match
library(tibble)
df1 %>%
mutate(across(everything(), ~ deframe(df2)[as.character(.)]))
The base R equivalent is
df1[] <- lapply(df1, function(x) df2$value[match(x, df2$Cell_ID)])

Replacing NA depending on distribution type of gender in R

I select the rows with NA values like this:
data[data=="na"] <- NA
data[!complete.cases(data),]
I must replace them, but the replacement depends on the type of distribution.
If, according to shapiro.test, the distribution of a variable is not normal,
then the missing value must be replaced by the median;
if it is normal, then it must be replaced by the mean.
The distribution must be tested separately for each gender (1 = girl, 2 = man).
data=structure(list(sex = c(1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L), emotion = c(20L,
15L, 49L, NA, 34L, 35L, 54L, 45L), IQ = c(101L, 98L, 105L, NA,
123L, 120L, 115L, NA)), .Names = c("sex", "emotion", "IQ"), class = "data.frame", row.names = c(NA,
-8L))
the desired output
sex emotion IQ
1 20 101
1 15 98
1 49 105
1 28 101
2 34 123
2 35 120
2 54 115
2 45 119
The following code will replace NA values according to the Shapiro-Wilk test:
library(dplyr)
data %>%
group_by(sex) %>%
mutate(
emotion = ifelse(!is.na(emotion), emotion,
ifelse(shapiro.test(emotion)$p.value > 0.05,
mean(emotion, na.rm=TRUE), quantile(emotion, na.rm=TRUE, probs=0.5) ) ),
IQ = ifelse(!is.na(IQ), IQ,
ifelse(shapiro.test(IQ)$p.value > 0.05,
mean(IQ, na.rm=TRUE), quantile(IQ, na.rm=TRUE, probs=0.5) )
)
)

Add value from previous row under conditions

I have a data frame called data, and I would like to add a new column containing the value from the previous row of an existing column, provided the factor (Id) is the same.
Here is a sample:
data <- structure(list(Id = c("a", "b", "b", "b", "a", "a", "b", "b",
"a", "a"), duration.minutes = c(NA, 139L, 535L, 150L, NA, NA,
145L, 545L, 144L, NA), event = structure(c(1L, 4L, 3L, 4L, 2L,
1L, 4L, 3L, 4L, 2L), .Label = c("enter", "exit", "stop", "trip"
), class = "factor")), .Names = c("Id", "duration.minutes", "event"
), class = "data.frame", row.names = 265:274)
and I would like to add a new column called "duration.minutes.past" like this:
data <- structure(list(Id = c("a", "b", "b", "b", "a", "a", "b", "b",
"a", "a"), duration.minutes = c(NA, 139L, 535L, 150L, NA, NA,
145L, 545L, 144L, NA), event = structure(c(1L, 4L, 3L, 4L, 2L,
1L, 4L, 3L, 4L, 2L), .Label = c("enter", "exit", "stop", "trip"
), class = "factor"), duration.minutes.past = c(NA, NA, 139,
NA, NA, NA, NA, 145, NA, NA)), .Names = c("Id", "duration.minutes",
"event", "duration.minutes.past"), row.names = 265:274, class = "data.frame")
As you can see, I added in this new column duration.minutes.past the duration.minutes of the previous trip for the same Id. If the Id is different, or if the event is not a stop, then the value for duration.minutes.past is NA.
Help is much appreciated!
A possible solution using dplyr,
library(dplyr)
df %>%
group_by(Id) %>%
mutate(new = replace(lag(duration.minutes), event != 'stop', NA))
#Source: local data frame [10 x 4]
#Groups: Id [2]
# Id duration.minutes event new
# <chr> <int> <fctr> <int>
#1 a NA enter NA
#2 b 139 trip NA
#3 b 535 stop 139
#4 b 150 trip NA
#5 a NA exit NA
#6 a NA enter NA
#7 b 145 trip NA
#8 b 545 stop 145
#9 a 144 trip NA
#10 a NA exit NA
We can do this with data.table. Convert the 'data.frame' to 'data.table' (setDT(data)); grouped by 'Id', we create the lag column of 'duration.minutes' (using shift), then change the value to 'NA' where the 'event' is not equal to 'stop'.
library(data.table)
setDT(data)[, duration.minutes.past := shift(duration.minutes),
Id][event != "stop", duration.minutes.past := NA][]
data
# Id duration.minutes event duration.minutes.past
#1: a NA enter NA
#2: b 139 trip NA
#3: b 535 stop 139
#4: b 150 trip NA
#5: a NA exit NA
#6: a NA enter NA
#7: b 145 trip NA
#8: b 545 stop 145
#9: a 144 trip NA
#10: a NA exit NA
Or this can be done with base R using ave
data$duration.minutes.past <- with(data, NA^(event != "stop") *
ave(duration.minutes, Id, FUN = function(x) c(NA, x[-length(x)])))

Aggregate Function - Keep NAs in data.frame

I want to use the aggregation function of R to aggregate a Price on several fields. However, I also have NAs in my data, which I would like to keep.
Tried:
> dput(df)
structure(list(ID = c(1L, 2L, 3L, 4L, 4L, 1L, 2L, 3L, 4L, 1L,
2L, 3L, 4L, 3L, 2L, 1L), REFERENCE = c("TEST1", "TEST2", "TEST3",
"TEST4", "TEST1", "TEST2", "TEST3", "TEST4", "TEST1", "TEST2",
"TEST3", "TEST4", "TEST1", "TEST2", "", "TEST2"), ISS = c(1234L,
1234L, 1111L, 1111L, 1234L, 1111L, 1234L, 1111L, 1234L, NA, 1234L,
1111L, 1234L, 1111L, 1234L, NA), Price = c(10L, NA, 20L, NA,
10L, 12L, NA, 99L, 100L, NA, 100L, 12L, NA, 11L, 0L, 12L)), .Names = c("ID",
"REFERENCE", "ISS", "Price"), row.names = c(NA, -16L), class = c("data.table",
"data.frame"), .internal.selfref = <pointer: 0x0000000000100788>)
>
> df <- aggregate(df$Price, by=list(ID=df$ID, REFERENCE=df$REFERENCE, ISS=df$ISS), FUN=sum)
Setting na.action = na.pass gives me:
Error in aggregate.data.frame(as.data.frame(x), ...) :
no rows to aggregate
As a result I would like to have:
Hence, I would like to keep my NA Data in my df.
Any recommendation how to implement that?
I appreciate your replies!
Instead of using aggregate on a "data.table", we can use the data.table methods. We get the sum of Price (sum(Price, na.rm=TRUE)) after grouping by "ID/REFERENCE/ISS" (by=list(ID, REFERENCE, ISS)). Order the output by "ID", "REFERENCE" (if needed).
library(data.table)
df[, sum(Price, na.rm=TRUE), by = list(ID, REFERENCE, ISS)][
order(ID, REFERENCE)]
# ID REFERENCE ISS V1
#1: 1 TEST1 1234 10
#2: 1 TEST2 1111 12
#3: 1 TEST2 NA 12
#4: 2 1234 0
#5: 2 TEST2 1234 0
#6: 2 TEST3 1234 100
#7: 3 TEST2 1111 11
#8: 3 TEST3 1111 20
#9: 3 TEST4 1111 111
#10: 4 TEST1 1234 110
#11: 4 TEST4 1111 0

reshaping data in R skipping certain measured variables

I would like to reshape a data.frame that looks like this:
permno dte ttm var1 var2 var3
1 123 2012-01-01 20 1 10 100
2 123 2012-01-01 30 -1 10 100
3 124 2012-01-01 20 2 20 200
4 124 2012-01-01 30 -2 20 200
I would like to make my data.frame look the following way:
permno dte var1_20 var1_30 var2 var3
1 123 2012-01-01 1 -1 10 100
2 124 2012-01-01 2 -2 20 200
I have been attempting to do this with reshape2 package but I am unable to isolate var1 from the rest and keep getting var2_20 and var2_30 for example in the results. Does anyone know how to do this using the reshape2 package?
data.frame dput:
> dput(DF)
structure(list(permno = c(123L, 123L, 124L, 124L), dte = structure(c(1L,
1L, 1L, 1L), .Label = " 2012-01-01", class = "factor"), ttm = c(20L,
30L, 20L, 30L), var1 = c(1L, -1L, 2L, -2L), var2 = c(10L, 10L,
20L, 20L), var3 = c(100L, 100L, 200L, 200L)), .Names = c("permno",
"dte", "ttm", "var1", "var2", "var3"), class = "data.frame", row.names = c(NA,
-4L))
> dput(result)
structure(list(permno = 123:124, dte = structure(c(1L, 1L), .Label = " 2012-01-01", class = "factor"),
var1_20 = 1:2, var1_30 = c(-1L, -2L), var2 = c(10L, 20L),
var3 = c(100L, 200L)), .Names = c("permno", "dte", "var1_20",
"var1_30", "var2", "var3"), class = "data.frame", row.names = c(NA,
-2L))
Use a combination of merge, reshape, and unique as follows:
unique(merge(DF[-c(3:4)],
reshape(DF[1:4], direction = "wide",
idvar = c("permno", "dte"),
timevar="ttm")))
# permno dte var2 var3 var1.20 var1.30
# 1 123 2012-01-01 10 100 1 -1
# 3 124 2012-01-01 20 200 2 -2
Basically, you reshape only the columns that need to be reshaped, and drop those columns from the original dataset before merging. You'll end up with duplicated rows, so just wrap all of that in unique to get (almost) your desired output. You can rearrange the column order if required.
I'm feeling rather clever about this answer, but I strongly suspect that I've made too many assumptions about your data, in particular the constant nature of var2 and var3:
ddply(dat,.(permno,dte,var2,var3),
function(x) { dcast(x,permno + dte + var2 + var3 ~ ttm,value.var = 'var1') })
permno dte var2 var3 20 30
1 123 2012-01-01 10 100 1 -1
2 124 2012-01-01 20 200 2 -2

Resources