Merge lines with same ID and take average value - r

From the table below I need to combine the lines by calculating the average value for those lines with same ID (column 2).
I was thinking of the plyr function??
ddply(df, summarize, value = average(ID))
df:
miRNA ID 100G 100R 106G 106R 122G 122R 124G 124R 126G 126R 134G 134R 141G 141R 167G 167R 185G 185R
1 hsa-miR-106a ID7 1585 423 180 113 598 266 227 242 70 106 2703 442 715 309 546 113 358 309
2 hsa-miR-1185-1 ID2 10 1 3 3 11 8 4 4 28 2 13 3 6 3 6 4 7 5
3 hsa-miR-1185-2 ID2 2 0 2 1 5 1 1 0 4 1 1 1 3 2 2 0 2 1
4 hsa-miR-1197 ID2 2 0 0 5 3 3 0 4 16 0 4 1 3 0 0 2 2 4
5 hsa-miR-127 ID3 29 17 6 55 40 35 6 20 171 10 32 21 23 25 10 14 32 55
Summary of original data:
> str(ClusterMatrix)
'data.frame': 113 obs. of 98 variables:
$ miRNA: Factor w/ 202 levels "hsa-miR-106a",..: 1 3 4 6 8 8 14 15 15 16 ...
$ ID : Factor w/ 27 levels "ID1","ID10","ID11",..: 25 12 12 12 21 21 12 21 21 6 ...
$ 100G : Factor w/ 308 levels "-0.307749042739963",..: 279 11 3 3 101 42 139 158 215 222 ...
$ 100R : Factor w/ 316 levels "-0.138028803567403",..: 207 7 8 8 18 42 128 183 232 209 ...
$ 106G : Factor w/ 260 levels "-0.103556709881933",..: 171 4 1 3 7 258 95 110 149 162 ...
$ 106R : Factor w/ 300 levels "-0.141810346640204",..: 141 4 6 2 108 41 146 196 244 267 ...
$ 122G : Factor w/ 336 levels "-0.0409548922061764",..: 237 12 4 6 103 47 148 203 257 264 ...
$ 122R : Factor w/ 316 levels "-0.135708706475279",..: 177 1 8 6 36 44 131 192 239 244 ...
$ 124G : Factor w/ 267 levels "-0.348439853247856",..: 210 5 2 3 7 50 126 138 188 249 ...
$ 124R : Factor w/ 303 levels "-0.176414190219115",..: 193 3 7 3 21 52 167 200 238 239 ...
$ 126G : Factor w/ 307 levels "-0.227658806811544",..: 122 88 5 76 169 61 240 220 281 265 ...
$ 126R : Factor w/ 249 levels "-0.271925865853123",..: 119 1 2 3 11 247 78 110 151 193 ...
$ 134G : Factor w/ 344 levels "-0.106333543799583",..: 304 14 8 5 33 48 150 196 248 231 ...
$ 134R : Factor w/ 300 levels "-0.0997616469801097",..: 183 5 7 7 22 298 113 159 213 221 ...
$ 141G : Factor w/ 335 levels "-0.134429748398679",..: 253 7 3 3 24 29 142 137 223 302 ...
$ 141R : Factor w/ 314 levels "-0.143299688877927",..: 210 4 5 7 98 54 154 199 255 251 ...
$ 167G : Factor w/ 306 levels "-0.211181452126958",..: 222 7 4 6 11 292 91 101 175 226 ...
$ 167R : Factor w/ 282 levels "-0.0490740880560127",..: 130 2 6 4 15 282 110 146 196 197 ...
$ 185G : Factor w/ 317 levels "-0.0567841338235346",..: 218 2 7 7 33 34 130 194 227 259 ...

We can use dplyr. We group by 'ID' and add, for each measurement column ('100G' to '185R'), a column holding the group mean; the measurement columns are selected with a regex pattern via matches(). Then we bind_cols the mean columns onto the original dataset, convert to data.frame if needed, and rename the mean columns.
library(dplyr)

# For every measurement column (names starting with a digit, "100G".."185R"),
# attach a companion "Mean_<col>" column holding the mean of that column
# within each ID group.  mutate_each()/funs() were removed from dplyr; the
# modern equivalent is mutate(across(...)) with a .names template.
out <- df1 %>%
  group_by(ID) %>%
  mutate(across(matches('^\\d+'),
                ~ mean(.x, na.rm = TRUE),
                .names = "Mean_{.col}")) %>%
  ungroup() %>%
  as.data.frame()

# Keep the original data and append only the per-group mean columns.
out1 <- bind_cols(df1, out[grep("^Mean_", names(out))])
out1
# miRNA ID 100G 100R 106G 106R 122G 122R 124G 124R 126G 126R 134G
#1 hsa-miR-106a ID7 1585 423 180 113 598 266 227 242 70 106 2703
#2 hsa-miR-1185-1 ID2 10 1 3 3 11 8 4 4 28 2 13
#3 hsa-miR-1185-2 ID2 2 0 2 1 5 1 1 0 4 1 1
#4 hsa-miR-1197 ID2 2 0 0 5 3 3 0 4 16 0 4
#5 hsa-miR-127 ID3 29 17 6 55 40 35 6 20 171 10 32
# 134R 141G 141R 167G 167R 185G 185R Mean_100G Mean_100R Mean_106G
#1 442 715 309 546 113 358 309 1585.000000 423.0000000 180.000000
#2 3 6 3 6 4 7 5 4.666667 0.3333333 1.666667
#3 1 3 2 2 0 2 1 4.666667 0.3333333 1.666667
#4 1 3 0 0 2 2 4 4.666667 0.3333333 1.666667
#5 21 23 25 10 14 32 55 29.000000 17.0000000 6.000000
# Mean_106R Mean_122G Mean_122R Mean_124G Mean_124R Mean_126G Mean_126R
#1 113 598.000000 266 227.000000 242.000000 70 106
#2 3 6.333333 4 1.666667 2.666667 16 1
#3 3 6.333333 4 1.666667 2.666667 16 1
#4 3 6.333333 4 1.666667 2.666667 16 1
#5 55 40.000000 35 6.000000 20.000000 171 10
# Mean_134G Mean_134R Mean_141G Mean_141R Mean_167G Mean_167R Mean_185G
#1 2703 442.000000 715 309.000000 546.000000 113 358.000000
#2 6 1.666667 4 1.666667 2.666667 2 3.666667
#3 6 1.666667 4 1.666667 2.666667 2 3.666667
#4 6 1.666667 4 1.666667 2.666667 2 3.666667
#5 32 21.000000 23 25.000000 10.000000 14 32.000000
# Mean_185R
#1 309.000000
#2 3.333333
#3 3.333333
#4 3.333333
#5 55.000000
EDIT: If we need a single row of means for each 'ID' instead, we can collapse the data to one summary row per group:
# One summary row per ID: the mean of every measurement column, with a
# "_mean" suffix on the result names.  summarise_each()/funs() were removed
# from dplyr; summarise(across(...)) is the modern spelling.
df1 %>%
  group_by(ID) %>%
  summarise(across(matches('^\\d+'),
                   ~ mean(.x, na.rm = TRUE),
                   .names = "{.col}_mean"))
EDIT2: Based on the OP's update, the original dataset's ('ClusterMatrix') columns are all of factor class. We need to convert the columns to numeric class before taking the mean. There are two options to convert a factor to numeric: 1) as.numeric(as.character(x)), which may be a bit slower; 2) as.numeric(levels(x))[x], which is faster. Here I am using the first method as it may be more clear.
# ClusterMatrix's measurement columns are factors, so each one is converted
# to numeric via as.character() before averaging.  summarise_each()/funs()
# were removed from dplyr; use summarise(across(...)) instead.
ClusterMatrix %>%
  group_by(ID) %>%
  summarise(across(matches('^\\d+'),
                   ~ mean(as.numeric(as.character(.x)), na.rm = TRUE),
                   .names = "{.col}_mean"))
data
# Example data: one row per miRNA with its cluster ID and the raw counts for
# each sample column.  check.names = FALSE keeps the digit-leading column
# names ("100G", ...) intact instead of prefixing them with "X".
df1 <- data.frame(
  miRNA = c("hsa-miR-106a", "hsa-miR-1185-1", "hsa-miR-1185-2",
            "hsa-miR-1197", "hsa-miR-127"),
  ID = c("ID7", "ID2", "ID2", "ID2", "ID3"),
  `100G` = c(1585L, 10L, 2L, 2L, 29L),
  `100R` = c(423L, 1L, 0L, 0L, 17L),
  `106G` = c(180L, 3L, 2L, 0L, 6L),
  `106R` = c(113L, 3L, 1L, 5L, 55L),
  `122G` = c(598L, 11L, 5L, 3L, 40L),
  `122R` = c(266L, 8L, 1L, 3L, 35L),
  `124G` = c(227L, 4L, 1L, 0L, 6L),
  `124R` = c(242L, 4L, 0L, 4L, 20L),
  `126G` = c(70L, 28L, 4L, 16L, 171L),
  `126R` = c(106L, 2L, 1L, 0L, 10L),
  `134G` = c(2703L, 13L, 1L, 4L, 32L),
  `134R` = c(442L, 3L, 1L, 1L, 21L),
  `141G` = c(715L, 6L, 3L, 3L, 23L),
  `141R` = c(309L, 3L, 2L, 0L, 25L),
  `167G` = c(546L, 6L, 2L, 0L, 10L),
  `167R` = c(113L, 4L, 0L, 2L, 14L),
  `185G` = c(358L, 7L, 2L, 2L, 32L),
  `185R` = c(309L, 5L, 1L, 4L, 55L),
  row.names = c("1", "2", "3", "4", "5"),
  check.names = FALSE
)

Related

data.table efficiently finding common pairs between 2 columns

say I have a dataframe
subject stim1 stim2 feedback
1 1003 50 51 1
2 1003 48 50 1
3 1003 49 51 1
4 1003 47 49 1
5 1003 47 46 1
6 1003 46 48 1
10 1003 50 48 1
428 1003 48 51 0
433 1003 46 50 0
434 1003 50 49 0
435 1003 54 59 0
I want to create a new column "transitive_pair" by
group by subject (column 1),
For every row in which feedback==0 (starting index 428, otherwise transitive_pair=NaN).
I want to return a boolean which tells me whether there is any chain of pairings (but only those in which feedback==1) that would transitively link stim1 and stim2 values.
Working out a few examples.
row 428- stim1=48 and stim2=51
48 and 51 are not paired but 51 was paired with 50 (e.g.row 1 ) and 50 was paired with 48 (row 10) so transitive_pair[428]=True
row 433- stim 1=46 and stim2=50
46 and 48 were paired (row 6) and 48 was paired with 50 (row 2) so transitive_pair[433]=True
in row 435, stim1=54, stim2=59
there is no chain of pairs that could link them (59 is not paired with anything while feedback==1) so transitive_pair[435]=False
desired output
subject stim1 stim2 feedback transitive_pair
1 1003 50 51 1 NaN
2 1003 48 50 1 NaN
3 1003 49 51 1 NaN
4 1003 47 49 1 NaN
5 1003 47 46 1 NaN
6 1003 46 48 1 NaN
10 1003 50 48 1 NaN
428 1003 48 51 0 1
433 1003 46 50 0 1
434 1003 50 49 0 1
435 1003 54 59 0 0
any help would be greatly appreciated!!
and putting a reproducible df here
structure(list(subject = c(1003L, 1003L, 1003L, 1003L, 1003L,
1003L, 1003L, 1003L, 1003L, 1003L, 1003L), stim1 = c(50L, 48L,
49L, 47L, 47L, 46L, 50L, 48L, 46L, 50L, 54L), stim2 = c(51L,
50L, 51L, 49L, 46L, 48L, 48L, 51L, 50L, 49L, 59L), feedback = c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 0L, 0L, 0L, 0L), transitive_pair = c(NaN,
NaN, NaN, NaN, NaN, NaN, NaN, 1, 1, 1, 0)), row.names = c(1L,
2L, 3L, 4L, 5L, 6L, 10L, 428L, 433L, 434L, 435L), class = "data.frame")
The columns "stim1" and "stim2" define an undirected graph. Create the graph for feedback == 1, get its connected components and for each row of the data.frame, check if the values of "stim1" and "stim2" belong to the same component. In the end assign NaN to the rows where feedback is 1.
suppressPackageStartupMessages(library(igraph))

# Rows with feedback == 1 are the learned pairings.
learned <- df1$feedback == 1

# Build an undirected graph whose edges are the learned stim1-stim2 pairs.
g <- graph_from_data_frame(df1[learned, c("stim1", "stim2")], directed = FALSE)
plot(g)

# Named vector: stimulus (vertex name) -> id of its connected component.
comp_id <- components(g)$membership

# A pair is transitively linked iff both stimuli occur in the graph and lie
# in the same connected component; stimuli absent from the graph get 0.
df1$transitive_pair_2 <- vapply(seq_len(nrow(df1)), function(r) {
  a <- match(as.character(df1$stim1[r]), names(comp_id))
  b <- match(as.character(df1$stim2[r]), names(comp_id))
  if (!is.na(a) && !is.na(b)) as.numeric(comp_id[a] == comp_id[b]) else 0
}, numeric(1))

# The learned rows themselves get no transitivity label.
df1$transitive_pair_2[learned] <- NaN
df1
#> subject stim1 stim2 feedback transitive_pair transitive_pair_2
#> 1 1003 50 51 1 NaN NaN
#> 2 1003 48 50 1 NaN NaN
#> 3 1003 49 51 1 NaN NaN
#> 4 1003 47 49 1 NaN NaN
#> 5 1003 47 46 1 NaN NaN
#> 6 1003 46 48 1 NaN NaN
#> 10 1003 50 48 1 NaN NaN
#> 428 1003 48 51 0 1 1
#> 433 1003 46 50 0 1 1
#> 434 1003 50 49 0 1 1
#> 435 1003 54 59 0 0 0
Created on 2022-07-31 by the reprex package (v2.0.1)

Filling in multiple columns of missing data from another dataset

I have a data set that contains some missing values which can be completed by merging with a another dataset. My example:
This is the updated data set I am working with.
DF1
Name Paper Book Mug soap computer tablet coffee coupons
1 2 3 4 5 6 7 8 9
2 21 22 23 23 23 7 23 9
3 56 57 58 59 60 7 62 9
4 80.33333 81.33333 82.33333 83 83.66667 7 85 9
5 107.3333 108.3333 109.3333 110 110.6667 7 112 9
6 134.3333 135.3333 136.3333 137 137.6667 7 139 9
7 161.3333 162.3333 163.3333 164 164.6667
8 188.3333 189.3333 190.3333 191 191.6667 7 193 9
9 215.3333 216.3333 217.3333 218 218.6667 7 220 9
10 242.3333 243.3333 244.3333 245 245.6667 7 247 9
11 269.3333 270.3333 271.3333 272 272.6667 7 274 9
12 296.3333 297.3333 298.3333 299 299.6667
13 323.3333 324.3333 325.3333 326 326.6667 7 328 9
14 350.3333 351.3333 352.3333 353 353.6667 7 355 9
15 377.3333 378.3333 379.3333 380 380.6667
16 404.3333 405.3333 406.3333 407 407.6667 7 409 9
17 431.3333 432.3333 433.3333 434 434.6667 7 436 9
18 458.3333 459.3333 460.3333 461 461.6667 7 463 9
19 485.3333 486.3333 487.3333 488 488.6667
DF2
Name Paper Book Mug soap computer tablet coffee coupons
7 161.3333 162.3333 163.3333 164 164.6667 6 6 6
12 296.3333 297.3333 298.3333 299 299.6667 88 96 25
15 377.3333 378.3333 379.3333 380 380.6667 88 62 25
19 485.3333 486.3333 487.3333 488 488.6667 88 88 78
I want to get:
Name Paper Book Mug soap computer tablet coffee coupons
1 2 3 4 5 6 7 8 9
2 21 22 23 23 23 7 23 9
3 56 57 58 59 60 7 62 9
4 80.33333 81.33333 82.33333 83 83.66667 7 85 9
5 107.3333 108.3333 109.3333 110 110.6667 7 112 9
6 134.3333 135.3333 136.3333 137 137.6667 7 139 9
7 161.3333 162.3333 163.3333 164 164.6667 6 6 6
8 188.3333 189.3333 190.3333 191 191.6667 7 193 9
9 215.3333 216.3333 217.3333 218 218.6667 7 220 9
10 242.3333 243.3333 244.3333 245 245.6667 7 247 9
11 269.3333 270.3333 271.3333 272 272.6667 7 274 9
12 296.3333 297.3333 298.3333 299 299.6667 88 96 25
13 323.3333 324.3333 325.3333 326 326.6667 7 328 9
14 350.3333 351.3333 352.3333 353 353.6667 7 355 9
15 377.3333 378.3333 379.3333 380 380.6667 88 62 25
16 404.3333 405.3333 406.3333 407 407.6667 7 409 9
17 431.3333 432.3333 433.3333 434 434.6667 7 436 9
18 458.3333 459.3333 460.3333 461 461.6667 7 463 9
19 485.3333 486.3333 487.3333 488 488.6667 88 88 78
I have tried the following code:
DF1[,c(4:6)][is.na(DF1[,c(4:6)]<-DF2[,c(2:4)][match(DF1[,1],DF2[,1])]
[which(is.na(DF1[,c(4:6)]))]
One of the solutions using dplyr will work if I omit the columns which are already complete. Not sure if it is my version of dplyr, which I updated last week.
Any help is greatly appreciated! Thanks!
We can do a left join and then coalesce the columns
library(dplyr)

# Bring DF2's copies of the shared columns in via a left join (they get the
# ".y" suffix), then keep DF1's value where it exists and fall back to DF2's
# where DF1 has NA.
joined <- left_join(DF1, DF2, by = "NameVar")
transmute(joined,
          NameVar,
          Var1,
          Var2,
          Var3 = coalesce(Var3.x, Var3.y),
          Var4 = coalesce(Var4.x, Var4.y),
          Var5 = coalesce(Var5.x, Var5.y))
-output
# NameVar Var1 Var2 Var3 Var4 Var5
#1 Sub1 30 45 40 34 65
#2 Sub2 25 30 30 45 45
#3 Sub3 74 34 25 30 49
#4 Sub4 30 45 40 34 65
#5 Sub5 25 30 69 56 72
#6 Sub6 74 34 74 34 60
Or using data.table
library(data.table)
# Columns present in both tables, minus the join key — these are the ones
# that may have gaps to fill.
nm1 <- setdiff(intersect(names(DF1), names(DF2)), 'NameVar')
# Update-join: for rows of DF1 matched by DF2 on NameVar, overwrite each
# column in nm1 with fcoalesce(own value, DF2's value).  Inside the join
# scope the "i." prefix refers to columns of the i-table (DF2); the update
# happens in place via `:=`.
setDT(DF1)[DF2, (nm1) := Map(fcoalesce, mget(nm1),
mget(paste0("i.", nm1))), on = .(NameVar)]
data
# Toy data: Var3-Var5 contain NAs that should be filled from DF2.
DF1 <- data.frame(
  NameVar = c("Sub1", "Sub2", "Sub3", "Sub4", "Sub5", "Sub6"),
  Var1 = c(30L, 25L, 74L, 30L, 25L, 74L),
  Var2 = c(45L, 30L, 34L, 45L, 30L, 34L),
  Var3 = c(40L, NA, NA, 40L, 69L, NA),
  Var4 = c(34L, NA, NA, 34L, 56L, NA),
  Var5 = c(65L, NA, NA, 65L, 72L, NA)
)
# Lookup table holding the replacement values for DF1's incomplete rows.
DF2 <- data.frame(
  NameVar = c("Sub2", "Sub3", "Sub6"),
  Var3 = c(30L, 25L, 74L),
  Var4 = c(45L, 30L, 34L),
  Var5 = c(45L, 49L, 60L)
)

Cross join two dataframes by key column using condition in R

I have two dataframes.
# Worker-level data: two workers, each with two of the six possible
# ID_SP_NAR codes.  prop_violations is a factor with levels "0.2" < "1".
mydata1 <- data.frame(
  ID_WORKES = c(58005854L, 58005854L, 58002666L, 58002666L),
  ID_SP_NAR = c(463L, 1951L, 21L, 465L),
  KOD_DEPO = c(3786L, 3786L, 1439L, 1439L),
  KOD_DOR = c(58L, 58L, 92L, 92L),
  COLUMN_MASH = c(6L, 6L, 5L, 5L),
  prop_violations = factor(c("0.2", "1", "1", "1"), levels = c("0.2", "1")),
  mash_score = c(0L, 2L, 2L, 2L)
)
# The full universe of ID_SP_NAR codes each worker should end up with.
mydata2 <- data.frame(ID_SP_NAR = c(463L, 1951L, 21L, 465L, 500L, 600L))
I need to cross-join (merge) these dataframes by ID_SP_NAR. mydata2 contains only the key variable ID_SP_NAR.
I need to join these in such a way that if an ID_WORKES is missing some of the ID_SP_NAR codes from mydata2, those codes are inserted into the dataset, with zero values filled in for the variables prop_violations and mash_score.
I.E. SP_ID_NAR in mydata2 has such values
ID_SP_NAR
463
1951
21
465
500
600
ID_workes =58005854 has
463,
1951
but another not have.
and
ID_WORKES = 58002666 has 21 and 465 and no others!
So desired output after cross join
ID_WORKES ID_SP_NAR KOD_DEPO KOD_DOR COLUMN_MASH prop_violations mash_score
1 58005854 463 3786 58 6 0.2 0
2 58005854 1951 3786 58 6 1 2
3 58005854 21 3786 58 6 0 0
4 58005854 465 3786 58 6 0 0
5 58005854 500 3786 58 6 0 0
6 58005854 600 3786 58 6 0 0
7 58002666 21 1439 92 5 1 2
8 58002666 465 1439 92 5 1 2
9 58002666 500 1439 92 5 0 0
10 58002666 600 1439 92 5 0 0
11 58002666 463 1439 92 5 0 0
12 58002666 1951 1439 92 5 0 0
KOD_DEPO,KOD_DOR,COLUMN_MASH have fixed value , it must be saved too.
How to do that?
merge(mydata1, mydata2, by = "ID_SP_NAR") is not working (I also tried a left join — that doesn't work either); it doesn't insert the zeros as I want.
We could use complete from tidyr to expand the dataset based on 'ID_WORKES' and the values of 'ID_SP_NAR' in the second dataset
library(tidyverse)

# Expand mydata1 so every ID_WORKES has one row for every ID_SP_NAR found in
# mydata2; newly created rows get "0"/0 for prop_violations/mash_score.
# mutate_if() is superseded — across(where(...)) is the modern equivalent.
mydata1 %>%
  mutate(across(where(is.factor), as.character)) %>%
  complete(ID_WORKES, ID_SP_NAR = mydata2$ID_SP_NAR,
           fill = list(prop_violations = '0', mash_score = 0)) %>%
  # Fill the fixed per-worker columns within each worker only (an ungrouped
  # fill(3:5) leaks one worker's KOD_DEPO/KOD_DOR/COLUMN_MASH into the next
  # worker's rows), and fill both directions so leading NA rows are covered.
  group_by(ID_WORKES) %>%
  fill(KOD_DEPO, KOD_DOR, COLUMN_MASH, .direction = "downup") %>%
  ungroup()
# A tibble: 12 x 7
# ID_WORKES ID_SP_NAR KOD_DEPO KOD_DOR COLUMN_MASH prop_violations mash_score
# <int> <int> <int> <int> <int> <chr> <dbl>
# 1 58002666 21 1439 92 5 1 2
# 2 58002666 463 1439 92 5 0 0
# 3 58002666 465 1439 92 5 1 2
# 4 58002666 500 1439 92 5 0 0
# 5 58002666 600 1439 92 5 0 0
# 6 58002666 1951 1439 92 5 0 0
# 7 58005854 21 1439 92 5 0 0
# 8 58005854 463 3786 58 6 0.2 0
# 9 58005854 465 3786 58 6 0 0
#10 58005854 500 3786 58 6 0 0
#11 58005854 600 3786 58 6 0 0
#12 58005854 1951 3786 58 6 1 2

R delete first and last x % of rows

I have a data frame with 3 ID variables, then several values for each ID.
user Log Pass Value
2 2 123 342
2 2 123 543
2 2 123 231
2 2 124 257
2 2 124 342
4 3 125 543
4 3 125 231
4 3 125 257
4 3 125 342
4 3 125 543
4 3 125 231
4 3 125 257
4 3 125 543
4 3 125 231
4 3 125 257
4 3 125 543
4 3 125 231
4 3 125 257
4 3 125 543
4 3 125 231
4 3 125 257
The start and end of each set of values is sometimes noisy, and I want to be able to delete the first few values. Unfortunately the number of values varies significantly, but it is always the first and last 20% of values that are noisy.
I want to delete the first 20% of rows, with a minimum of 1 row deleted.
So for instance if there are 20 values for user 2 log 2 pass 123 I want to delete the first and last 4 rows. If there are only 3 values for the ID variable I want to delete the first and last row.
The resulting dataset would be:
user Log Pass Value
2 2 123 543
4 3 125 543
4 3 125 231
4 3 125 257
4 3 125 543
4 3 125 231
4 3 125 257
4 3 125 543
4 3 125 231
I've tried fiddling around with nrow but I struggle to figure out how to reference the % of rows by id variable.
Thanks.
Jonathan.
I believe the following can do it.
DATA.
# Example data: user 2 has 5 rows (passes 123/124), user 4 has 16 rows
# (pass 125).  rep() makes the run-length structure explicit.
dat <- data.frame(
  user  = c(rep(2L, 5), rep(4L, 16)),
  Log   = c(rep(2L, 5), rep(3L, 16)),
  Pass  = c(rep(123L, 3), rep(124L, 2), rep(125L, 16)),
  Value = c(342L, 543L, 231L, 257L, 342L,
            543L, 231L, 257L, 342L, 543L, 231L, 257L,
            543L, 231L, 257L, 543L, 231L, 257L,
            543L, 231L, 257L)
)
CODE.
# Trim the noisy edges of one group: drop the first and last p-fraction of
# rows, always removing at least one row from each end.
#
# x: a data frame holding one group's rows
# p: fraction to trim from each end (default 20%)
fun <- function(x, p = 0.20){
  n <- nrow(x)
  cut <- max(1, round(n * p))
  all_rows <- seq_len(n)
  keep <- setdiff(all_rows, c(head(all_rows, cut), tail(all_rows, cut)))
  x[keep, ]
}
# Trim each user's rows, then stitch the pieces back together.
# NOTE(review): this splits by user only; if trimming should happen per
# (user, Log, Pass) group, split on interaction(dat$user, dat$Log, dat$Pass).
result <- do.call(rbind, lapply(split(dat, dat$user), fun))
# Drop the composite "user.row" names produced by rbind-ing the split pieces.
row.names(result) <- NULL
result
# user Log Pass Value
#1 2 2 123 543
#2 2 2 123 231
#3 2 2 124 257
#4 4 3 125 342
#5 4 3 125 543
#6 4 3 125 231
#7 4 3 125 257
#8 4 3 125 543
#9 4 3 125 231
#10 4 3 125 257
#11 4 3 125 543
#12 4 3 125 231
#13 4 3 125 257
Would something like this help?
For a dataframe df:
# Remove the first floor(20%) of rows and everything past ceiling(80%).
df[-c(1:floor(nrow(df)*0.2), (1+ceiling(nrow(df)*0.8)):nrow(df)),]
Just removing the first and last 20%, taking the upper and lower values so that for smaller data frame you keep some of the information:
> df<-data.frame(a=1:100)
> df[-c(1:floor(nrow(df)*0.2),(1+ceiling(nrow(df)*0.8)):nrow(df)),]
[1] 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50
[31] 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80
> df<-data.frame(1:3)
> df[-c(1:floor(nrow(df)*0.2),(1+ceiling(nrow(df)*0.8)):nrow(df)),]
[1] 2
You can do this with dplyr...
library(dplyr)
# Trim the noisy edges of every (user, Log, Pass) group: drop at least the
# first and last row, and 20% from each end for larger groups.
df2 <- df %>% group_by(user, Log, Pass) %>%
filter(n()>2) %>% #remove those with just two elements or fewer
# keep rows max(2, ...) .. min(n()-1, ...): the bounds guarantee a minimum
# of one row removed at each end of the group.
slice(max(2, 1+ceiling(n()*0.2)):min(n()-1, floor(0.8*n())))
df2
user Log Pass Value
1 2 2 123 543
2 4 3 125 543
3 4 3 125 231
4 4 3 125 257
5 4 3 125 543
6 4 3 125 231
7 4 3 125 257
8 4 3 125 543
9 4 3 125 231
Calculate the offset for what you want to retain:
# Index of the first row to keep: skip the first ceiling(20%) rows.
# (Fixed: the original referenced an undefined object 'x' instead of 'dat'.)
rem <- ceiling( nrow( dat ) * .2 ) + 1
Then take out the records you don't want:
# Keep rows rem .. (nrow(dat) - rem + 1) so the same number of rows is
# trimmed from each end (without the "+ 1" one extra row is lost from the
# tail relative to the head).
dat <- dat[ rem : ( nrow( dat ) - rem + 1 ), ]
Here is an idea using base R that returns the row indices of each user to keep and then subsets on these indices.
# For each user, compute the row indices that survive trimming 20% (at
# least one row) from both ends.
idx <- unlist(lapply(split(seq_along(dat[["user"]]), dat[["user"]]), function(x) {
# number of rows to shave off each end of this user's block
tmp <- max(1, ceiling(.2 * length(x)))
# head(x, -tmp) drops the last tmp indices; tail(.., -tmp) drops the first tmp
tail(head(x, -tmp), -tmp)}),
use.names=FALSE)
split(seq_along(dat[["user"]]), dat[["user"]]) returns a list of the row indices for each user. lapply loops through these index vectors, calculating the number of rows to drop from each end with max(1, ceiling(.2 * length(x))), and then dropping them with tail(head(x, -tmp), -tmp). Since lapply returns a named list, the result is unlisted with the names dropped.
This returns
idx
2 3 4 10 11 12 13 14 15 16 17
Now subset
dat[idx,]
user Log Pass Value
2 2 2 123 543
3 2 2 123 231
4 2 2 124 257
10 4 3 125 543
11 4 3 125 231
12 4 3 125 257
13 4 3 125 543
14 4 3 125 231
15 4 3 125 257
16 4 3 125 543
17 4 3 125 231

R: Can you specify the order of variable columns from reshape/cast?

I'm using the cast function to create a wide formatted data frame. I'd like to be able to control the ordering of the columns that result from using cast. Is this possible?
In the example below, they are order: cogs_xdep, sales, sga
I would like to order them: sales, cogs_xdep, sga
The whole process actually starts with a wide formatted frame that I use melt to change into a long format before the cast.
An example is below
>rawdata_all
coy_name gvkey_iid factor X20130930 X20130831 X20130731 X20130630 X20130531 X20130430 X20130331 X20130228 X20130131 X20121231
1 Coy 1 111111 sales 1 2 3 4 5 6 7 8 9 10
2 Coy 2 22222 sales 2 12 22 32 42 52 62 72 82 92
3 Coy 3 333333 sales 3 103 203 303 403 503 603 703 803 903
4 Coy 1 111111 cogs_xdep 4 5 6 7 8 9 10 11 12 13
5 Coy 2 22222 cogs_xdep 5 15 25 35 45 55 65 75 85 95
6 Coy 3 333333 cogs_xdep 6 106 206 306 406 506 606 706 806 906
7 Coy 1 111111 sga 7 8 9 10 11 12 13 14 15 16
8 Coy 2 22222 sga 8 18 28 38 48 58 68 78 88 98
9 Coy 3 333333 sga 9 109 209 309 409 509 609 709 809 909
...
Melt to put it in long format
> non_data_cols <- 3 # There are 3 non-value columns
> master_long <- melt(rawdata_all, id=1:non_data_cols,measured=(non_data_cols+1):length(rawdata_all))
> master_long
coy_name gvkey_iid factor variable value
1 Coy 1 111111 sales X20130930 1
2 Coy 2 22222 sales X20130930 2
3 Coy 3 333333 sales X20130930 3
4 Coy 1 111111 cogs_xdep X20130930 4
5 Coy 2 22222 cogs_xdep X20130930 5
6 Coy 3 333333 cogs_xdep X20130930 6
7 Coy 1 111111 sga X20130930 7
8 Coy 2 22222 sga X20130930 8
...
Finally cast on 'factor' to create wider data frame. (I also renamed 'variable' to 'date' and removed the 'X' from the start of the date values).
> master <- cast(master_long, ...~factor)
coy_name gvkey_iid date cogs_xdep sales sga
1 Coy 1 111111 20130930 4 1 7
2 Coy 1 111111 20130831 5 2 8
3 Coy 1 111111 20130731 6 3 9
4 Coy 1 111111 20130630 7 4 10
5 Coy 1 111111 20130531 8 5 11
6 Coy 1 111111 20130430 9 6 12
7 Coy 1 111111 20130331 10 7 13
8 Coy 1 111111 20130228 11 8 14
9 Coy 1 111111 20130131 12 9 15
10 Coy 1 111111 20121231 13 10 16
...
I would ideally like to have the final 3 columns in the following order: sales, cogs_xdep, sga. cast appears to have arranged them alphabetically as you can see they are ordered in the desired way in both the original data frame and the long formatted data frame.
Any suggestions would be greatly appreciated. While it is easier enough to rearrange the columns with only 3, it's more cumbersome in the real situation of 30+ columns.
Thanks,
I don't see a reason why column order should matter. However, you can always change the order like this:
# Keep the first three id columns, then append the value columns in their
# first-appearance order, taken from the long data's factor column.
master[,c(names(master)[1:3], as.character(unique(master_long$factor)))]
coy_name gvkey_iid variable sales cogs_xdep sga
1 Coy_1 111111 X20130930 1 4 7
2 Coy_1 111111 X20130831 2 5 8
3 Coy_1 111111 X20130731 3 6 9
4 Coy_1 111111 X20130630 4 7 10
5 Coy_1 111111 X20130531 5 8 11
6 Coy_1 111111 X20130430 6 9 12
7 Coy_1 111111 X20130331 7 10 13
8 Coy_1 111111 X20130228 8 11 14
9 Coy_1 111111 X20130131 9 12 15
10 Coy_1 111111 X20121231 10 13 16
11 Coy_2 22222 X20130930 2 5 8
12 Coy_2 22222 X20130831 12 15 18
13 Coy_2 22222 X20130731 22 25 28
14 Coy_2 22222 X20130630 32 35 38
15 Coy_2 22222 X20130531 42 45 48
16 Coy_2 22222 X20130430 52 55 58
17 Coy_2 22222 X20130331 62 65 68
18 Coy_2 22222 X20130228 72 75 78
19 Coy_2 22222 X20130131 82 85 88
20 Coy_2 22222 X20121231 92 95 98
21 Coy_3 333333 X20130930 3 6 9
22 Coy_3 333333 X20130831 103 106 109
23 Coy_3 333333 X20130731 203 206 209
24 Coy_3 333333 X20130630 303 306 309
25 Coy_3 333333 X20130531 403 406 409
26 Coy_3 333333 X20130430 503 506 509
27 Coy_3 333333 X20130331 603 606 609
28 Coy_3 333333 X20130228 703 706 709
29 Coy_3 333333 X20130131 803 806 809
30 Coy_3 333333 X20121231 903 906 909
Note that package reshape2 has mostly superseded reshape.
I found here: dcast - order factors how you can tell dcast do not change the order of factors alphabetically.
# Toy data: two questions (q1, q2) with per-level frequencies/percentages.
# Built via structure() so the Levels factor keeps its deliberately
# non-alphabetical level order (vf1, df2, fd3, re4, fr5).
# NOTE(review): the object carries class "data.table" without having been
# made by data.table()/setDT(); the dcast call below runs setDT() anyway.
dff<-structure(list(ID_no = c(2, 2, 2, 2, 2, 29, 29, 29, 29, 29),
Variable = structure(c(1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L),
.Label = c("q1", "q2" ), class = "factor"),
Levels = structure(c(1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L),
.Label = c("vf1", "df2", "fd3", "re4", "fr5" ), class = "factor"),
Frequency = c(2, 15, 41, 28, 20, 6, 0, 5, 27, 82),
Percentage = c(1.89, 14.15, 38.68, 26.42, 18.87, 5, 0, 4.17, 22.5, 68.33)),
.Names = c("ID_no", "Variable", "Levels", "Frequency", "Percentage"),
row.names = c(NA, -10L), class = c("data.table", "data.frame"))
ID_no Variable Levels Frequency Percentage
1: 2 q1 vf1 2 1.89
2: 2 q1 df2 15 14.15
3: 2 q1 fd3 41 38.68
4: 2 q1 re4 28 26.42
5: 2 q1 fr5 20 18.87
6: 29 q2 vf1 6 5.00
7: 29 q2 df2 0 0.00
8: 29 q2 fd3 5 4.17
9: 29 q2 re4 27 22.50
10: 29 q2 fr5 82 68.33
# factor(Levels, levels=unique(Levels)) pins the cast columns to their
# first-appearance order instead of dcast's default alphabetical sorting.
dcast(setDT(dff), Variable~factor(Levels, levels=unique(Levels)), value.var=c("Percentage"))
Some random factor levels, with their number to indicate their order.
Variable vf1 df2 fd3 re4 fr5
1: q1 1.89 14.15 38.68 26.42 18.87
2: q2 5.00 0.00 4.17 22.50 68.33
Hope this help.

Resources