Related
Let's say I have this dataframe:
ID X1 X2
1 1 2
2 2 1
3 3 1
4 4 1
5 5 5
6 6 20
7 7 20
8 9 20
9 10 20
dataset <- structure(list(ID = 1:9, X1 = c(1L, 2L, 3L, 4L, 5L, 6L, 7L, 9L,
10L), X2 = c(2L, 1L, 1L, 1L, 5L, 20L, 20L, 20L, 20L)),
class = "data.frame", row.names = c(NA,
-9L))
And I want to select the rows in which the absolute value of the difference between columns X1 and X2 is greater than or equal to 2.
For example, row 4 value is 4-1, which is 3 and should be selected.
Row 9 value is 10-20, which is -10. Absolute value is 10 and should be selected.
In this case it would be rows 3, 4, 6, 7, 8 and 9
I tried:
dataset2 = dataset[,abs(dataset- c(dataset[,2])) > 2]
But I get an error.
The operation:
abs(dataset- c(dataset[,2])) > 2
Does flag entries whose absolute difference is more than 2, but the result is only meaningful for my second column and does not select the rows properly
We can get the difference between the 'X1', 'X2' columns, create a logical expression in subset to subset the rows
subset(dataset, abs(X1 - X2) >= 2)
# ID X1 X2
#3 3 3 1
#4 4 4 1
#6 6 6 20
#7 7 7 20
#8 8 9 20
#9 9 10 20
Or using index
subset(dataset, abs(dataset[[2]] - dataset[[3]]) >= 2)
data
dataset <- structure(list(ID = 1:9, X1 = c(1L, 2L, 3L, 4L, 5L, 6L, 7L, 9L,
10L), X2 = c(2L, 1L, 1L, 1L, 5L, 20L, 20L, 20L, 20L)),
class = "data.frame", row.names = c(NA,
-9L))
Hi, this is an Excel version of the data that I want to be able to create in R.
To be clear: every time an observation date has 5 in its Group column, I need the column Group_fix to equal 5 for that observation and for the following observations within the next 12-month period.
How can this be done in R? Can we use the ifelse function?
Here is an approach with lag from dplyr.
library(dplyr)
# Carry a Group value of 5 forward: GroupFix is 5 when Group is 5 in the
# current observation or in any of the 11 preceding observations (a window of
# 12 observations in total); otherwise GroupFix is the row's own Group.
data %>%
  mutate(
    GroupFix = case_when(
      # lag() returns NA for the first rows; an NA condition is treated as
      # "no match" by case_when(), so those rows fall through to the default.
      Group == 5 |
        lag(Group, 1) == 5 |
        lag(Group, 2) == 5 |
        lag(Group, 3) == 5 |
        lag(Group, 4) == 5 |
        lag(Group, 5) == 5 |
        lag(Group, 6) == 5 |
        lag(Group, 7) == 5 |
        lag(Group, 8) == 5 |
        lag(Group, 9) == 5 |
        lag(Group, 10) == 5 |
        lag(Group, 11) == 5 ~ 5,
      # as.numeric() keeps both branches of case_when() the same type.
      TRUE ~ as.numeric(Group)
    )
  )
Observation.Date Group GroupFix
1 12/31/19 1 1
2 1/31/20 2 2
3 2/29/20 2 2
4 3/31/20 2 2
5 4/30/20 3 3
6 5/31/20 4 4
7 6/30/20 5 5
8 7/31/20 5 5
9 8/31/20 4 5
10 9/30/20 3 5
11 10/31/20 2 5
12 11/30/20 3 5
13 12/31/20 4 5
14 1/31/21 5 5
15 2/28/21 5 5
16 3/31/21 4 5
17 4/30/21 3 5
18 5/31/21 2 5
19 6/30/21 1 5
20 7/31/21 1 5
21 8/31/21 1 5
22 9/30/21 1 5
23 10/31/21 1 5
24 11/30/21 1 5
25 12/31/21 1 5
26 1/31/22 1 5
27 2/28/22 1 1
Data
data <- structure(list(Observation.Date = structure(c(8L, 1L, 13L, 14L,
16L, 18L, 20L, 22L, 24L, 26L, 4L, 6L, 9L, 2L, 11L, 15L, 17L,
19L, 21L, 23L, 25L, 27L, 5L, 7L, 10L, 3L, 12L), .Label = c("1/31/20",
"1/31/21", "1/31/22", "10/31/20", "10/31/21", "11/30/20", "11/30/21",
"12/31/19", "12/31/20", "12/31/21", "2/28/21", "2/28/22", "2/29/20",
"3/31/20", "3/31/21", "4/30/20", "4/30/21", "5/31/20", "5/31/21",
"6/30/20", "6/30/21", "7/31/20", "7/31/21", "8/31/20", "8/31/21",
"9/30/20", "9/30/21"), class = "factor"), Group = c(1L, 2L, 2L,
2L, 3L, 4L, 5L, 5L, 4L, 3L, 2L, 3L, 4L, 5L, 5L, 4L, 3L, 2L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L)), class = "data.frame", row.names = c(NA,
-27L))
I have a dataframe in the following form:
person currentTest beforeValue afterValue
1 1 A 1.284297055 2.671763513
2 2 A -0.618359548 -2.354926905
3 3 A 0.039457430 -0.091709968
4 4 A -0.448608324 -0.362851832
5 5 A -0.961777124 -1.416284339
6 6 A 0.702471895 2.052181444
7 7 A -0.455222045 -2.125684279
8 8 A -1.231549132 -2.777425148
9 9 A -0.797234990 -0.558306183
10 10 A -0.709734963 -1.244159550
11 1 B -0.472799377 -0.869472343
12 2 B 0.059720737 1.444855389
13 3 B 0.924201532 2.731049485
14 4 B 0.658884183 1.017542475
15 5 B -1.989807256 -4.712671740
16 6 B 0.660241305 1.971232718
17 7 B 0.089636952 -0.564457911
18 8 B -0.828399941 0.507659171
19 9 B -0.838074237 -0.316996942
20 10 B -1.659197101 -3.317623686
...
What I'd like is to get a data frame of:
person A_Before A_After B_Before, B_After, ...
1 1.284297055 2.671763513 -0.472799377 -0.869472343
2 -0.618359548 -2.354926905 0.059720737 1.444855389
...
I've tried gather and spread but that's not quite what I need as there's the creation of new columns. Any suggestions?
The dput version for easy access is below:
resultsData <- dput(resultsData)
structure(list(person = c(1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L,
10L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 1L, 2L, 3L, 4L,
5L, 6L, 7L, 8L, 9L, 10L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L,
10L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 1L, 2L, 3L, 4L,
5L, 6L, 7L, 8L, 9L, 10L), currentTest = structure(c(1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L, 4L,
4L, 4L, 4L, 4L, 4L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 6L,
6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L), .Label = c("A", "B", "C",
"D", "E", "F"), class = "factor"), beforeValue = c(1.28429705541541,
-0.618359548370402, 0.039457429902531, -0.448608324038257, -0.961777123997687,
0.702471895259405, -0.455222044740939, -1.23154913153736, -0.797234989892673,
-0.709734963076803, -0.47279937661921, 0.0597207367403981, 0.924201531911827,
0.658884182599422, -1.98980725637449, 0.660241304554785, 0.0896369516528346,
-0.828399941497236, -0.838074236572976, -1.65919710134782, 0.577469369909437,
1.92748171699512, -0.245593641496638, 0.126104785456265, -0.559338325961641,
1.29802115505785, 0.719406692531958, 0.969414499181256, -0.814697072724845,
0.86465983690719, -0.709539159817187, 1.02775240926492, -0.50490096148732,
0.40769259465753, -0.868531009656408, 0.949518511358715, 2.32458579520932,
-0.257578702370506, -0.789761851618986, 0.0979274657020477, -0.00803566278013502,
1.42984177159549, 1.45485678109231, -0.956556613290905, 0.443323691839299,
-0.261951072972966, -1.30990441429799, 0.0921741874883992, -1.02612779569131,
0.81550719514697, -0.403037731404182, -0.384422139459082, 0.417074857491798,
-1.37128032791855, -0.0796160137501127, 1.35302483988882, -0.752751140138746,
0.812453275384099, -1.32443072805549, -1.66986584340583), afterValue = c(2.67176351335094,
-2.35492690509713, -0.0917099675669388, -0.362851831626841, -1.4162843393352,
2.05218144382074, -2.12568427901904, -2.77742514848958, -0.558306182843248,
-1.24415954975022, -0.869472343362331, 1.44485538931333, 2.73104948477609,
1.01754247530805, -4.71267174035743, 1.9712327179732, -0.564457911016569,
0.507659170771878, -0.31699694238194, -3.31762368638082, 1.09068172988414,
4.37537723545199, -0.116850493406969, 1.9533832597394, -1.69003563933244,
2.62250581307257, -0.00837379068728961, 1.84192937988371, -0.675899868505659,
2.08506660046288, -0.583526785879512, 0.699298693972492, -1.26172199141024,
1.23589313451783, -1.56008919968504, 0.436686458587792, 0.11699090169902,
-1.07206510594109, 1.21204947218164, -0.812406581646911, 0.50373332256566,
-0.084945367568491, -0.236015748624917, -0.479606239480476, -0.596799139055039,
-0.562575023441403, -0.339935276865152, -0.213813544612318, -0.265296303857373,
-1.12545083569158, 0.0105156062602101, 0.635695183644557, 0.767433440961415,
0.16648012185356, 0.544633089427927, -0.904001384160196, -0.429299134808951,
0.764224744168297, -0.166062348771635, -0.101892580202475)), class = c("tbl_df",
"tbl", "data.frame"), row.names = c(NA, -60L), .Names = c("person",
"currentTest", "beforeValue", "afterValue"))
We can use dcast from reshape2
library(reshape2)
# Stack beforeValue/afterValue into long format, keeping the two identifier
# columns (person, currentTest) fixed.
meltdf <- melt(resultsData, id.vars = c("person", "currentTest"))
# Cast back to wide: one row per person, one column per test/measure pair.
dcast(meltdf, person ~ currentTest + variable)
> dcast(meltdf, person ~ currentTest + variable)
person A_beforeValue A_afterValue B_beforeValue B_afterValue C_beforeValue C_afterValue D_beforeValue D_afterValue E_beforeValue
1 1 1.28429706 2.67176351 -0.47279938 -0.8694723 0.5774694 1.090681730 -0.70953916 -0.5835268 -0.008035663
2 2 -0.61835955 -2.35492691 0.05972074 1.4448554 1.9274817 4.375377235 1.02775241 0.6992987 1.429841772
3 3 0.03945743 -0.09170997 0.92420153 2.7310495 -0.2455936 -0.116850493 -0.50490096 -1.2617220 1.454856781
4 4 -0.44860832 -0.36285183 0.65888418 1.0175425 0.1261048 1.953383260 0.40769259 1.2358931 -0.956556613
5 5 -0.96177712 -1.41628434 -1.98980726 -4.7126717 -0.5593383 -1.690035639 -0.86853101 -1.5600892 0.443323692
6 6 0.70247190 2.05218144 0.66024130 1.9712327 1.2980212 2.622505813 0.94951851 0.4366865 -0.261951073
7 7 -0.45522204 -2.12568428 0.08963695 -0.5644579 0.7194067 -0.008373791 2.32458580 0.1169909 -1.309904414
8 8 -1.23154913 -2.77742515 -0.82839994 0.5076592 0.9694145 1.841929380 -0.25757870 -1.0720651 0.092174187
9 9 -0.79723499 -0.55830618 -0.83807424 -0.3169969 -0.8146971 -0.675899869 -0.78976185 1.2120495 -1.026127796
10 10 -0.70973496 -1.24415955 -1.65919710 -3.3176237 0.8646598 2.085066600 0.09792747 -0.8124066 0.815507195
E_afterValue F_beforeValue F_afterValue
1 0.50373332 -0.40303773 0.01051561
2 -0.08494537 -0.38442214 0.63569518
3 -0.23601575 0.41707486 0.76743344
4 -0.47960624 -1.37128033 0.16648012
5 -0.59679914 -0.07961601 0.54463309
6 -0.56257502 1.35302484 -0.90400138
7 -0.33993528 -0.75275114 -0.42929913
8 -0.21381354 0.81245328 0.76422474
9 -0.26529630 -1.32443073 -0.16606235
10 -1.12545084 -1.66986584 -0.10189258
You can use a combined gather + spread approach; Gather the *Values columns and combine with currentTest to form the new header, then spread to wide format:
# Reshape resultsData to one row per person with one column per
# <currentTest>_<measure> pair (assumes dplyr and tidyr are loaded).
resultsData %>%
# Stack every column except person and currentTest into key/value pairs.
gather(key, value, -person, -currentTest) %>%
# Join currentTest and key into a single header, e.g. "A_beforeValue".
unite(header, c('currentTest', 'key'), sep = "_") %>%
# Give each header its own column.
spread(header, value)
# A tibble: 10 x 13
# person A_afterValue A_beforeValue B_afterValue B_beforeValue C_afterValue C_beforeValue
# * <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 1 2.67176351 1.28429706 -0.8694723 -0.47279938 1.090681730 0.5774694
# 2 2 -2.35492691 -0.61835955 1.4448554 0.05972074 4.375377235 1.9274817
# 3 3 -0.09170997 0.03945743 2.7310495 0.92420153 -0.116850493 -0.2455936
# 4 4 -0.36285183 -0.44860832 1.0175425 0.65888418 1.953383260 0.1261048
# 5 5 -1.41628434 -0.96177712 -4.7126717 -1.98980726 -1.690035639 -0.5593383
# 6 6 2.05218144 0.70247190 1.9712327 0.66024130 2.622505813 1.2980212
# 7 7 -2.12568428 -0.45522204 -0.5644579 0.08963695 -0.008373791 0.7194067
# 8 8 -2.77742515 -1.23154913 0.5076592 -0.82839994 1.841929380 0.9694145
# 9 9 -0.55830618 -0.79723499 -0.3169969 -0.83807424 -0.675899869 -0.8146971
#10 10 -1.24415955 -0.70973496 -3.3176237 -1.65919710 2.085066600 0.8646598
# ... with 6 more variables: D_afterValue <dbl>, D_beforeValue <dbl>, E_afterValue <dbl>,
# E_beforeValue <dbl>, F_afterValue <dbl>, F_beforeValue <dbl>
If you need to rename the columns:
# Same reshape as above, then drop the trailing "Value" from the new column
# names (e.g. "A_beforeValue" becomes "A_before").
resultsData %>%
  gather(key, value, -person, -currentTest) %>%
  unite(header, c("currentTest", "key"), sep = "_") %>%
  spread(header, value) %>%
  # A plain function is passed instead of the deprecated funs() wrapper.
  rename_at(vars(matches("Value$")), function(nm) sub("Value$", "", nm))
We could do this in a single line using recast
reshape2::recast(resultsData, person ~currentTest + variable, id.var = 1:2)
#person A_beforeValue A_afterValue B_beforeValue B_afterValue C_beforeValue C_afterValue D_beforeValue D_afterValue
#1 1 1.28429706 2.67176351 -0.47279938 -0.8694723 0.5774694 1.090681730 -0.70953916 -0.5835268
#2 2 -0.61835955 -2.35492691 0.05972074 1.4448554 1.9274817 4.375377235 1.02775241 0.6992987
#3 3 0.03945743 -0.09170997 0.92420153 2.7310495 -0.2455936 -0.116850493 -0.50490096 -1.2617220
#4 4 -0.44860832 -0.36285183 0.65888418 1.0175425 0.1261048 1.953383260 0.40769259 1.2358931
#5 5 -0.96177712 -1.41628434 -1.98980726 -4.7126717 -0.5593383 -1.690035639 -0.86853101 -1.5600892
#6 6 0.70247190 2.05218144 0.66024130 1.9712327 1.2980212 2.622505813 0.94951851 0.4366865
#7 7 -0.45522204 -2.12568428 0.08963695 -0.5644579 0.7194067 -0.008373791 2.32458580 0.1169909
#8 8 -1.23154913 -2.77742515 -0.82839994 0.5076592 0.9694145 1.841929380 -0.25757870 -1.0720651
#9 9 -0.79723499 -0.55830618 -0.83807424 -0.3169969 -0.8146971 -0.675899869 -0.78976185 1.2120495
#10 10 -0.70973496 -1.24415955 -1.65919710 -3.3176237 0.8646598 2.085066600 0.09792747 -0.8124066
# E_beforeValue E_afterValue F_beforeValue F_afterValue
#1 -0.008035663 0.50373332 -0.40303773 0.01051561
#2 1.429841772 -0.08494537 -0.38442214 0.63569518
#3 1.454856781 -0.23601575 0.41707486 0.76743344
#4 -0.956556613 -0.47960624 -1.37128033 0.16648012
#5 0.443323692 -0.59679914 -0.07961601 0.54463309
#6 -0.261951073 -0.56257502 1.35302484 -0.90400138
#7 -1.309904414 -0.33993528 -0.75275114 -0.42929913
#8 0.092174187 -0.21381354 0.81245328 0.76422474
#9 -1.026127796 -0.26529630 -1.32443073 -0.16606235
#10 0.815507195 -1.12545084 -1.66986584 -0.10189258
My data looks like this:
Group Feature_A Feature_B Feature_C Feature_D
1 1 0 3 2 4
2 1 5 2 2 8
3 1 9 8 6 5
4 2 5 7 8 8
5 2 2 6 8 1
6 2 3 8 6 4
7 3 1 5 3 5
8 3 1 4 3 4
df <- structure(list(Group = c(1L, 1L, 1L, 2L, 2L, 2L, 3L, 3L), Feature_A = c(0L,
5L, 9L, 5L, 2L, 3L, 1L, 1L), Feature_B = c(3L, 2L, 8L, 7L, 6L,
8L, 5L, 4L), Feature_C = c(2L, 2L, 6L, 8L, 8L, 6L, 3L, 3L), Feature_D = c(4L,
8L, 5L, 8L, 1L, 4L, 5L, 4L)), .Names = c("Group", "Feature_A",
"Feature_B", "Feature_C", "Feature_D"), class = "data.frame", row.names = c(NA,
-8L))
For every Feature I want to generate a plot (e.g., a boxplot) that would highlight the differences between Groups.
# Get the feature names (every column except the first, Group) and the
# distinct group values.
Features <- colnames(df)[-1]
# df$Group is a plain vector, so colnames() on it would be NULL; take the
# unique values of the column itself.
Group <- unique(df$Group)
But how can I do the rest?
Pseudo-code might look like this:
Select Feature from Data
Split Data according Group
Boxplot
for (i in 1:levels(df$Features)){
for (o in 1:length(Group)){
}}
How can I achieve this? Hope someone can help me.
I would put my data in long format. Then, using ggplot2, you can do some nice things.
library(reshape2)
library(ggplot2)
library(gridExtra)
## long format using Group as id: one row per (Group, feature, value)
dat.m <- melt(df, id.vars = "Group")
## bar plot: values stacked per Group, coloured by feature
p1 <- ggplot(dat.m) +
  geom_bar(aes(x = Group, y = value, fill = variable), stat = "identity")
## box plot: one box per feature within each Group
p2 <- ggplot(dat.m) +
  geom_boxplot(aes(x = factor(Group), y = value, fill = variable))
## aggregate the 2 plots
grid.arrange(p1, p2)
This is easy to do. I do this all the time
The code below will generate the charts using ggplot and save them as ch_Feature_A ....
you can wrap the answer in a pdf statement to send them to pdf as well
library(ggplot2)
# Treat Group as a categorical variable so each group gets its own box.
df$Group <- as.factor(df$Group)
# Build one boxplot per feature column (every column except Group) and store
# it in a variable named ch_<column>, e.g. ch_Feature_A. Iterating over the
# column names avoids the 2:ncol(df) pitfall of counting backwards when there
# are no feature columns.
for (feature in setdiff(names(df), "Group")) {
  ch <- ggplot(df, aes_string(x = "Group", y = feature, fill = "Group")) +
    geom_boxplot()
  assign(paste0("ch_", feature), ch)
}
or even simpler, if you do not want separate charts
library(reshape2)
# Melt with Group as the explicit id column. Converting it to a factor here
# makes the snippet work whether or not df$Group was converted beforehand.
df1 <- melt(transform(df, Group = factor(Group)), id.vars = "Group")
# One boxplot per group, with one facet per feature.
ggplot(df1, aes(x = Group, y = value, fill = Group)) +
  geom_boxplot() +
  facet_grid(. ~ variable)
I have a dataframe like this:
id col1
1 1 1
2 2 2
3 3 3
4 4 4
5 5 1
6 1 2
7 2 3
8 3 4
I would like to group by id and then create, for each id, a string that contains the values of col1 in descending order, separated by spaces.
I first order the data frame by id and col1 but am unable to get the output from ddply as a string with no quotes.
df111 <- df111[order(df111$id, -df111$col1),]
df222 <- ddply(df111, .(id), function(col1) as.character(paste0(col1,sep = ' ')))
id V1 V2
1 1 c(1, 1, 1, 1) c(0.793507214868441, 0.539258575299755, 0.165128685068339, 0.153290810529143)
2 2 c(2, 2, 2, 2) c(0.872032727580518, 0.827515688957646, 0.236087603960186, 0.165240615839139)
3 3 c(3, 3, 3, 3) c(0.759382889838889, 0.484359077410772, 0.182580581633374, 0.0723447729833424)
4 4 c(4, 4, 4, 4) c(0.874859027564526, 0.642130059422925, 0.0569298807531595, 0.0227038362063468)
5 5 c(5, 5, 5, 5) c(0.392553070792928, 0.386064056074247, 0.299609177513048, 0.222290486795828)
I'd like something like this:
id V1
1 1 .793507214868441 0.539258575299755 0.165128685068339 0.153290810529143
Any suggestions?
EDIT:
> dput(df111)
structure(list(id = c(1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 3L, 3L,
3L, 3L, 4L, 4L, 4L, 4L, 5L, 5L, 5L, 5L), col1 = c(0.793507214868441,
0.539258575299755, 0.165128685068339, 0.153290810529143, 0.872032727580518,
0.827515688957646, 0.236087603960186, 0.165240615839139, 0.759382889838889,
0.484359077410772, 0.182580581633374, 0.0723447729833424, 0.874859027564526,
0.642130059422925, 0.0569298807531595, 0.0227038362063468, 0.392553070792928,
0.386064056074247, 0.299609177513048, 0.222290486795828)), .Names = c("id",
"col1"), row.names = c(1L, 11L, 16L, 6L, 7L, 12L, 17L, 2L, 18L,
13L, 8L, 3L, 14L, 9L, 19L, 4L, 20L, 10L, 5L, 15L), class = "data.frame")
I think maybe you just need to use summarise rather than a custom anonymous function...?
dat <- read.table(text = "id col1
1 1 1
2 2 2
3 3 3
4 4 4
5 5 1
6 1 2
7 2 3
8 3 4",header = TRUE,sep = "")
> ddply(dat,.(id),summarise,val = paste(sort(col1,decreasing = TRUE),collapse = " "))
id val
1 1 2 1
2 2 3 2
3 3 4 3
4 4 4
5 5 1