I use the following code to plot every row of my data frame into a PDF file, one plot per page.
library(gplots)
# Write one plot per row of `tbl_alles` into a multi-page PDF.
# Column 1 supplies the title of each page; the remaining columns are the
# y-values, plotted against their position index.
pdf(file = "Plots of all rows.pdf")
# seq_len() gives an empty sequence for 0, whereas 1:0 counts down (1, 0)
x <- seq_len(ncol(tbl_alles) - 1)
for (i in seq_len(nrow(tbl_alles))) {
  # plot onto a new pdf page
  plot(
    x = x, y = tbl_alles[i, -1], type = "b", main = tbl_alles[i, 1],
    xlab = "X", ylab = "Y"
  )
}
# close the pdf file
dev.off()
How can I plot just a few rows (say, 3) on the same graph, each in a different colour?
The legend should use the names of the rows.
my data:
> dput(head(tbl_alles))
structure(list(`10` = c(0, 0, 0, 0, 0, 0), `20` = c(0, 0, 0,
0, 0, 0), `52.5` = c(0, 0, 0, 0, 0, 0), `81` = c(0, 0, 1, 0,
0, 0), `110` = c(0, 0, 0, 0, 0, 0), `140.5` = c(0, 0, 0, 0, 0,
0), `189` = c(0, 0, 0, 0, 0, 0), `222.5` = c(0, 0, 0, 0, 0, 0
), `278` = c(0, 0, 0, 0, 0, 0), `340` = c(0, 0, 0, 0, 0, 0),
`397` = c(0, 1, 0, 0, 0, 0), `453.5` = c(0, 0.66069369, 0,
0, 0, 1), `529` = c(0, 0.521435654, 0, 0, 1, 0), `580` = c(0,
0.437291195, 0, 0, 1, 0), `630.5` = c(0, 0.52204783, 0, 0,
0, 0), `683.5` = c(0, 0.52429838, 0, 0, 0, 0), `735.5` = c(1,
0.3768651, 0, 1, 0, 0), `784` = c(0, 0, 0, 0, 0, 0), `832` = c(0,
0, 0, 0, 0, 0), `882.5` = c(0, 0, 0, 0, 0, 0), `926.5` = c(0,
0, 0, 0, 0, 0), `973` = c(0, 0, 0, 0, 0, 0), `1108` = c(0,
0, 0, 0, 0, 0), `1200` = c(0, 0, 0, 0, 0, 0)), .Names = c("10",
"20", "52.5", "81", "110", "140.5", "189", "222.5", "278", "340",
"397", "453.5", "529", "580", "630.5", "683.5", "735.5", "784",
"832", "882.5", "926.5", "973", "1108", "1200"), row.names = c("at1g01050.1",
"at1g01080.1", "at1g01090.1", "at1g01220.1", "at1g01420.1", "at1g01470.1"
), class = "data.frame")
That's the output I would like to see. Just put both plots on the same graph with different colours.
Edit:
> tbl_alles[1066,]
10 20 52.5 81 110 140.5 189 222.5 278 340 397 453.5 529 580 630.5 683.5 735.5 784 832
at3g01510.1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1
882.5 926.5 973 1108 1200
at3g01510.1 0 0 0 0 0
> tbl_alles[2269,]
10 20 52.5 81 110 140.5 189 222.5 278 340 397 453.5 529 580 630.5 683.5 735.5 784 832
at5g26570.1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1
882.5 926.5 973 1108 1200
at5g26570.1 0 0 0 0 0
> tbl_alles[109,]
10 20 52.5 81 110 140.5 189 222.5 278 340 397 453.5 529 580 630.5 683.5 735.5
at1g10760.1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.5707348 0.8569183
784 832 882.5 926.5 973 1108 1200
at1g10760.1 0.9070698 1 0 0 0 0 0
That's what I would like to achieve:
I always forget about matplot. I'm not familiar with gplots, but I think matplot fits this problem well.
Here's a small df I made to replicate your problem
df<-structure(list(`10` = c(-0.371350351694367, -1.25087314646208,
-0.657843532649542), `20` = c(-1.1851240454548, -2.78219491586262,
0.710133251591709), `52.5` = c(1.68304915266843, -0.496047552485386,
0.450948253352661), `81` = c(0.129735967979554, 0.254350517817915,
-0.18288033694209), `110` = c(-1.13792416656106, 0.685305257987392,
-0.00333217347885503), `140.5` = c(-3.60035706471287, 0.147934251860607,
1.21899119774361), `189` = c(0.20579357437275, -0.287714362235557,
-0.116174227557464), `222.5` = c(-2.27228706498774, 1.14330151676478,
0.437320821753322), `278` = c(0.0184198982292088, 1.03280897369263,
-0.809374548090546), `340` = c(1.10314005160547, -0.423635755487127,
1.69263287858465), `397` = c(1.11000715987197, -0.666713965188873,
-0.616331157049669), `453.5` = c(-0.21763982556254, -0.668405154120432,
-0.101290244002727), `529` = c(-1.23612437632145, 0.596727546451954,
1.65926804193271), `580` = c(-1.36914007441172, 0.553436632958187,
0.370452646201169), `630.5` = c(-1.28739943545904, 1.51605377604701,
0.535674548844182), `683.5` = c(0.0380431318762389, 0.259944835916881,
-1.12267356731606), `735.5` = c(0.307116139352162, 0.619942543650423,
1.43847332323359), `784` = c(0.517011770731407, -0.271348876993244,
0.382706886840812), `832` = c(0.358756221875511, 0.902328658122764,
2.19653579973421), `882.5` = c(0.60565816196684, -1.69443962691366,
-0.338433483486653), `926.5` = c(2.15044754686289, -0.979574461038407,
0.116260893315264), `973` = c(-1.3051680247044, -0.735063827396212,
-1.55018820456708), `1108` = c(-0.108476761260576, 1.21094890415222,
-1.04130290709525), `1200` = c(-0.963125050259433, -1.12921931676616,
-0.357160373571803)), .Names = c("10", "20", "52.5", "81", "110",
"140.5", "189", "222.5", "278", "340", "397", "453.5", "529",
"580", "630.5", "683.5", "735.5", "784", "832", "882.5", "926.5",
"973", "1108", "1200"), row.names = c("at1g01050.1", "at1g01080.1",
"at1g01090.1"), class = "data.frame")
Now your code, notice I use matplot, transpose the df, convert it to a matrix (for matplot), and specify a type for each of the data sets we'll be plotting
library(gplots)
# Draw the selected rows of `df` as coloured lines on a single PDF page,
# with a legend built from the row names.
# open the pdf file
pdf(file = "Plots of all rows.pdf")
rowsToPlot <- c(1, 2, 3)
line_cols <- rainbow(length(rowsToPlot))
# matplot() cycles through lty 1:5 by default; spell it out so that the
# legend can mirror the line types actually drawn
line_types <- rep_len(1:5, length(rowsToPlot))
# matplot() draws one series per matrix column, so transpose to put every
# selected row into its own column (t() on a data frame returns a matrix)
matplot(t(df[rowsToPlot, ]), type = "l", col = line_cols, lty = line_types)
# label each line with the name of the row it was taken from
legend(
  "topright",
  legend = rownames(df)[rowsToPlot], col = line_cols, lty = line_types
)
# close the pdf file
dev.off()
And my picture
Related
I'm trying to merge two data.frames in R so that the values in test_1 are overwritten if they exist in test_2
Every time I try and join/merge them I end up with a bunch of NAs.
I can't work out a simple logic for if.na then use test_2. Is that what I'd need to do, or is there an easier way?
test_1 = structure(list(rn = c("Red", "Blue",
"Green", "Yellow", "Pink", "Gold"
), X2022.08.01 = c(0, 0, 0, 0, 0, 0), X2022.08.02 = c(0, 0, 0,
0, 0, 0), X2022.08.03 = c(0, 0, 0, 0, 0, 0), X2022.08.04 = c(0,
0, 0, 0, 0, 0), X2022.08.05 = c(0, 0, 0, 0, 0, 0), X2022.08.08 = c(0,
0, 0, 0, 0, 0), X2022.08.09 = c(0, 0, 0, 0, 0, 0), X2022.08.10 = c(0,
0, 0, 0, 0, 0), X2022.08.11 = c(0, 0, 0, 0, 0, 0), X2022.08.12 = c(0,
0, 0, 0, 0, 0), X2022.08.15 = c(0, 0, 0, 0, 0, 0), X2022.08.16 = c(0,
0, 0, 0, 0, 0), X2022.08.17 = c(0, 0, 0, 0, 0, 0), X2022.08.18 = c(0,
0, 0, 0, 0, 0), X2022.08.19 = c(0, 0, 0, 0, 0, 0), X2022.08.22 = c(0,
0, 0, 0, 0, 0), X2022.08.23 = c(0, 0, 0, 0, 0, 0), X2022.08.24 = c(0,
0, 0, 0, 0, 0), X2022.08.25 = c(0, 0, 0, 0, 0, 0), X2022.08.26 = c(0,
0, 0, 0, 0, 0), X2022.08.29 = c(0, 0, 0, 0, 0, 0), X2022.08.30 = c(0,
0, 0, 0, 0, 0), X2022.08.31 = c(0, 0, 0, 0, 0, 0)), row.names = c(NA,
6L), class = "data.frame")
test_2 = structure(list(rn = c("Blue", "Pink",
"Red", "Yellow", "Green", "Gold"
), X2022.08.01 = c(10, 10, 10, 10, 10, 10), X2022.08.03 = c(10, 10, 10,
10, 10, 10), X2022.08.04 = c(10, 10, 10, 10, 10, 10), X2022.08.05 = c(10,
10, 10, 10, 10, 10), X2022.08.26 = c(10, 10, 10, 10, 10, 10)), row.names = c(NA,
6L), class = "data.frame")
the desired output would look like this:
test_output = structure(list(rn = c("Red", "Blue",
"Green", "Yellow", "Pink", "Gold"
), X2022.08.01 = c(10, 10, 10, 10, 10, 10), X2022.08.02 = c(0, 0, 0,
0, 0, 0), X2022.08.03 = c(10, 10, 10, 10, 10, 10), X2022.08.04 = c(10,
10, 10, 10, 10, 10), X2022.08.05 = c(10, 10, 10, 10, 10, 10), X2022.08.08 = c(0,
0, 0, 0, 0, 0), X2022.08.09 = c(0, 0, 0, 0, 0, 0), X2022.08.10 = c(0,
0, 0, 0, 0, 0), X2022.08.11 = c(0, 0, 0, 0, 0, 0), X2022.08.12 = c(0,
0, 0, 0, 0, 0), X2022.08.15 = c(0, 0, 0, 0, 0, 0), X2022.08.16 = c(0,
0, 0, 0, 0, 0), X2022.08.17 = c(0, 0, 0, 0, 0, 0), X2022.08.18 = c(0,
0, 0, 0, 0, 0), X2022.08.19 = c(0, 0, 0, 0, 0, 0), X2022.08.22 = c(0,
0, 0, 0, 0, 0), X2022.08.23 = c(0, 0, 0, 0, 0, 0), X2022.08.24 = c(0,
0, 0, 0, 0, 0), X2022.08.25 = c(0, 0, 0, 0, 0, 0), X2022.08.26 = c(10,
10, 10, 10, 10, 10), X2022.08.29 = c(0, 0, 0, 0, 0, 0), X2022.08.30 = c(0,
0, 0, 0, 0, 0), X2022.08.31 = c(0, 0, 0, 0, 0, 0)), row.names = c(NA,
6L), class = "data.frame")
If you use the column names of the source as indices on both sides of the assignment, you can get replacement with
# Overwrite values in test_1 with those from test_2. Rows are paired through
# the `rn` key instead of by position, because the two data frames list `rn`
# in different orders; `rn` itself keeps the order it has in test_1.
value_cols <- setdiff(colnames(test_2), "rn")
src_row <- match(test_1$rn, test_2$rn)
# rows of test_1 whose key does not occur in test_2 are left untouched
has_match <- !is.na(src_row)
test_1[has_match, value_cols] <- test_2[src_row[has_match], value_cols]
test_1
rn X2022.08.01 X2022.08.02 X2022.08.03 X2022.08.04 X2022.08.05 X2022.08.08 X2022.08.09
1 Red 10 0 10 10 10 0 0
2 Blue 10 0 10 10 10 0 0
3 Green 10 0 10 10 10 0 0
4 Yellow 10 0 10 10 10 0 0
5 Pink 10 0 10 10 10 0 0
6 Gold 10 0 10 10 10 0 0
X2022.08.10 X2022.08.11 X2022.08.12 X2022.08.15 X2022.08.16 X2022.08.17 X2022.08.18 X2022.08.19
1 0 0 0 0 0 0 0 0
2 0 0 0 0 0 0 0 0
3 0 0 0 0 0 0 0 0
4 0 0 0 0 0 0 0 0
5 0 0 0 0 0 0 0 0
6 0 0 0 0 0 0 0 0
X2022.08.22 X2022.08.23 X2022.08.24 X2022.08.25 X2022.08.26 X2022.08.29 X2022.08.30 X2022.08.31
1 0 0 0 0 10 0 0 0
2 0 0 0 0 10 0 0 0
3 0 0 0 0 10 0 0 0
4 0 0 0 0 10 0 0 0
5 0 0 0 0 10 0 0 0
6 0 0 0 0 10 0 0 0
I doubt there could be a simpler method. It could even work with a more limited number of rows, as long as the indices are a proper subset of the column and row names of the target matrix or data frame. Note: I don't understand the issue with NAs; there were no NAs in either structure.
I have two data frames of different sizes that partly share the same row and column names.
I want to filter tpm_datExpr by finding the common column and row names with datTraits.
So I want my tpm_datExpr dataframe to have 278 columns.
> colnames(tpm_datExpr)[1:10]
[1] "D5247_S53_L006" "D5248_S54_L006" "D5249_S67_L008" "E02874_L1_S1_L001"
[5] "E02875_L1_S2_L001" "E02876_L1_S3_L001" "E02877_L1_S4_L001" "E02878_L1_S5_L001"
[9] "E02879_L1_S6_L001" "E02880_L1_S7_L001"
> rownames(datTraits)[1:10]
[1] "D5247_S53_L006" "D5248_S54_L006" "D5249_S67_L008" "E02874_L1_S1_L001"
[5] "E02875_L1_S2_L001" "E02876_L1_S3_L001" "E02877_L1_S4_L001" "E02878_L1_S5_L001"
[9] "E02879_L1_S6_L001" "E02880_L1_S7_L001"
> ncol(tpm_datExpr)
[1] 623
> nrow(datTraits)
[1] 278
You may use intersect on rownames and names. Example:
# Keep only the rows and columns of df1 whose names also occur in df2.
common_rows <- intersect(rownames(df1), rownames(df2))
common_cols <- intersect(names(df1), names(df2))
# drop = FALSE keeps the result a data frame even if only one column is shared
df1[common_rows, common_cols, drop = FALSE]
# X1 X5 X6
# 2 0 0 0
# 4 0 0 0
# 8 0 0 0
# 9 0 0 0
Data:
df1 <- structure(list(X1 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0), X2 = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0), X3 = c(0, 0, 0, 0, 0, 0, 0, 0, 0,
0), X4 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0), X5 = c(0, 0, 0, 0,
0, 0, 0, 0, 0, 0), X6 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0)), class = "data.frame", row.names = c(NA,
-10L))
df2 <- structure(list(X1 = c(0, 0, 0, 0), X5 = c(0, 0, 0, 0), X6 = c(0,
0, 0, 0)), row.names = c(2L, 4L, 8L, 9L), class = "data.frame")
require(gtsummary)
test <- structure(list(`1` = c(0, 0, 0, 0, 0, 0, 0, 0, 1, 0), `2` = c(1,0, 0, 0, 0, 1, 0, 1, 0, 0), `3` = c(0, 0, 0, 0, 0, 0, 0, 0, 0,0), `4` = c(1, 1, 0, 0, 1, 0, 0, 0, 0, 0), `5` = c(1, 0, 1, 1,0, 1, 1, 0, 0, 0), `6` = c(0, 0, 0, 1, 0, 0, 1, 0, 0, 0), `7` = c(0,0, 0, 0, 0, 0, 0, 0, 0, 0), `8` = c(0, 0, 0, 0, 0, 0, 0, 0, 0,0), `9` = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0), `10` = c(0, 0, 0,0, 0, 0, 0, 0, 0, 1)), row.names = c(NA, -10L), class = c("tbl_df","tbl", "data.frame"))
In this example data, I have 10 categorical variables.
`1` `2` `3` `4` `5` `6` `7` `8` `9` `10`
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 0 1 0 1 1 0 0 0 0 0
2 0 0 0 1 0 0 0 0 0 0
3 0 0 0 0 1 0 0 0 0 0
4 0 0 0 0 1 1 0 0 0 0
5 0 0 0 1 0 0 0 0 0 0
6 0 1 0 0 1 0 0 0 0 0
7 0 0 0 0 1 1 0 0 0 0
8 0 1 0 0 0 0 0 0 0 0
9 1 0 0 0 0 0 0 0 0 0
10 0 0 0 0 0 0 0 0 0 1
Since they can overlap each other, I have put them in different columns,
using 1 and 0 to indicate "yes" or "no" to having (or not having) the categorical variable.
When test %>% tbl_summary(), it creates:
I would like to sort this by frequency, but
test %>% tbl_summary(sort = list(everything() ~ "frequency"))
does not work.
Is there any way to do this?
Thank you in advance.
The tbl_summary(sort=) argument sorts levels within a variable, not the order in which the variables appear in the table. Variables appear in the table in the same order they appear in the data frame.
We can update the order in the data frame using the code below.
library(gtsummary)
#> #Uighur
packageVersion("gtsummary")
#> [1] '1.5.0'
test <- structure(list(`1` = c(0, 0, 0, 0, 0, 0, 0, 0, 1, 0), `2` = c(1,0, 0, 0, 0, 1, 0, 1, 0, 0), `3` = c(0, 0, 0, 0, 0, 0, 0, 0, 0,0), `4` = c(1, 1, 0, 0, 1, 0, 0, 0, 0, 0), `5` = c(1, 0, 1, 1,0, 1, 1, 0, 0, 0), `6` = c(0, 0, 0, 1, 0, 0, 1, 0, 0, 0), `7` = c(0,0, 0, 0, 0, 0, 0, 0, 0, 0), `8` = c(0, 0, 0, 0, 0, 0, 0, 0, 0,0), `9` = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0), `10` = c(0, 0, 0,0, 0, 0, 0, 0, 0, 1)), row.names = c(NA, -10L), class = c("tbl_df","tbl", "data.frame"))
# Rank the variables from most to least prevalent: the mean of each 0/1
# column is the share of rows in which that variable is present.
prev <- sort(vapply(test, mean, numeric(1)), decreasing = TRUE)
# Put the data frame columns into that order before summarising.
test %>%
  select(all_of(names(prev))) %>%
  tbl_summary() %>%
  as_kable() # convert to kable for SO
Characteristic
N = 10
5
5 (50%)
2
3 (30%)
4
3 (30%)
6
2 (20%)
1
1 (10%)
10
1 (10%)
3
0 (0%)
7
0 (0%)
8
0 (0%)
9
0 (0%)
Created on 2021-12-10 by the reprex package (v2.0.1)
I'm trying to subtract the values of each habitat covariate between the years 2010 and 2019. That is, for every id that has a row for both 2010 and 2019, the habitat values of one row should be subtracted from the other; rows whose id has only one year should be left in the data frame as they are.
Here's an example of the dataset and what I expect for the output:
#dataset example
# A tibble: 30 x 18
id year pland_00_water pland_01_evergr~ pland_02_evergr~ pland_03_decidu~ pland_04_decidu~ pland_05_mixed_~ pland_06_closed~
<int> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 267 2019 0.0833 0 0 0 0 0 0
2 268 2019 0.2 0 0 0 0 0 0
3 362 2019 0.1 0 0 0 0 0 0
4 420 2019 0.0556 0 0 0 0 0 0
5 421 2019 0.0667 0 0 0 0 0 0
6 484 2019 0.125 0 0 0 0 0 0
7 492 2010 0.1 0 0 0 0 0 0
8 492 2019 0.1 0 0 0 0 0 0
9 719 2010 0.0769 0 0 0 0 0 0
10 719 2019 0.0769 0 0 0 0 0 0
#output example
# A tibble: 30 x 18
id year pland_00_water pland_01_evergr~ pland_02_evergr~ pland_03_decidu~ pland_04_decidu~ pland_05_mixed_~ pland_06_closed~
<int> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 267 2019 0.0833 0 0 0 0 0 0
2 268 2019 0.2 0 0 0 0 0 0
3 362 2019 0.1 0 0 0 0 0 0
4 420 2019 0.0556 0 0 0 0 0 0
5 421 2019 0.0667 0 0 0 0 0 0
6 484 2019 0.125 0 0 0 0 0 0
7 492 changed 0 0 0 0 0 0 0
9 719 changed 0 0 0 0 0 0 0
I can imagine this working with a function and boolean operators such that, if year 2010 & 2019 match by id then minus the next row by the previous (assuming that they're ordered by id then this should work), otherwise, if they do not match by id then leave them as is.
I'm trying to wrap my head around which code to use for this, I can see this working within a function and using lapply to apply across the entire dataset.
Here's a reproducible code:
structure(list(id = c(267L, 268L, 362L, 420L, 421L, 484L, 492L,
492L, 719L, 719L, 986L, 986L, 1071L, 1071L, 1303L, 1303L, 1306L,
1399L, 1399L, 1400L, 1400L, 2007L, 2083L, 2083L, 2134L, 2135L,
2136L, 2213L, 2213L, 2214L), year = c(2019, 2019, 2019, 2019,
2019, 2019, 2010, 2019, 2010, 2019, 2010, 2019, 2010, 2019, 2010,
2019, 2010, 2010, 2019, 2010, 2019, 2019, 2010, 2019, 2019, 2019,
2019, 2010, 2019, 2010), pland_00_water = c(0.0833333333333333,
0.2, 0.1, 0.0555555555555556, 0.0666666666666667, 0.125, 0.1,
0.1, 0.0769230769230769, 0.0769230769230769, 0.0588235294117647,
0.0588235294117647, 0.0714285714285714, 0.0714285714285714, 0.0769230769230769,
0.0769230769230769, 0.0588235294117647, 0.05, 0.05, 0.111111111111111,
0.111111111111111, 0.0526315789473684, 0.142857142857143, 0.142857142857143,
0.0666666666666667, 0.0588235294117647, 0.1, 0.142857142857143,
0.142857142857143, 0.25), pland_01_evergreen_needleleaf = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.0588235294117647, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), pland_02_evergreen_broadleaf = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0), pland_03_deciduous_needleleaf = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.0714285714285714, 0, 0,
0, 0, 0.05, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), pland_04_deciduous_broadleaf = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.0714285714285714, 0.0714285714285714,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), pland_05_mixed_forest = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0), pland_06_closed_shrubland = c(0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0), pland_07_open_shrubland = c(0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0), pland_08_woody_savanna = c(0, 0, 0, 0, 0, 0,
0, 0, 0.0769230769230769, 0.0769230769230769, 0.0588235294117647,
0.0588235294117647, 0.0714285714285714, 0.0714285714285714, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), pland_09_savanna = c(0,
0, 0, 0, 0, 0, 0, 0, 0.0769230769230769, 0.0769230769230769,
0.0588235294117647, 0.0588235294117647, 0, 0, 0, 0.0769230769230769,
0.0588235294117647, 0.05, 0.05, 0.111111111111111, 0.111111111111111,
0, 0, 0, 0, 0, 0, 0, 0, 0), pland_10_grassland = c(0.0833333333333333,
0.2, 0.1, 0.0555555555555556, 0.0666666666666667, 0.125, 0.1,
0.1, 0.0769230769230769, 0.0769230769230769, 0.0588235294117647,
0.0588235294117647, 0.0714285714285714, 0.0714285714285714, 0.0769230769230769,
0.0769230769230769, 0.0588235294117647, 0.05, 0.05, 0.111111111111111,
0.111111111111111, 0.0526315789473684, 0.142857142857143, 0.142857142857143,
0.0666666666666667, 0.0588235294117647, 0.1, 0.142857142857143,
0.142857142857143, 0.25), pland_11_wetland = c(0.0833333333333333,
0.2, 0.1, 0.0555555555555556, 0, 0, 0.1, 0.1, 0.0769230769230769,
0.0769230769230769, 0.0588235294117647, 0.0588235294117647, 0.0714285714285714,
0.0714285714285714, 0.0769230769230769, 0.0769230769230769, 0.0588235294117647,
0.05, 0.05, 0.111111111111111, 0, 0.0526315789473684, 0.142857142857143,
0.142857142857143, 0.0666666666666667, 0.0588235294117647, 0.1,
0.142857142857143, 0.142857142857143, 0), pland_12_cropland = c(0.0833333333333333,
0.2, 0.1, 0.0555555555555556, 0.0666666666666667, 0.125, 0.1,
0.1, 0.0769230769230769, 0.0769230769230769, 0.0588235294117647,
0, 0, 0, 0.0769230769230769, 0.0769230769230769, 0.0588235294117647,
0.05, 0.05, 0.111111111111111, 0.111111111111111, 0.0526315789473684,
0.142857142857143, 0.142857142857143, 0.0666666666666667, 0,
0, 0.142857142857143, 0.142857142857143, 0.25), pland_13_urban = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0), pland_14_mosiac = c(0, 0, 0, 0, 0, 0,
0, 0, 0.0769230769230769, 0.0769230769230769, 0, 0.0588235294117647,
0, 0, 0, 0, 0, 0.05, 0.05, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
pland_15_barren = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)), row.names = c(NA,
-30L), class = c("tbl_df", "tbl", "data.frame"))
Here's a tidyverse version:
library(dplyr)
# Append one "changed" row per id that has both a 2010 and a 2019 record,
# holding the 2019 value minus the 2010 value for every other column.
# `year` is compared against, and later stacked with, character values, so
# coerce it up front: bind_rows() refuses to combine a numeric `year` with
# the "changed" label.
x_chr <- x %>%
  mutate(year = as.character(year))
x_chr %>%
  arrange(year) %>%
  # can add 'id' if desired, minimum 'year' required for below
  group_by(id) %>%
  filter(
    all(c("2010", "2019") %in% year),
    year %in% c("2010", "2019")
  ) %>%
  # rows are sorted by year, so diff() yields later minus earlier
  summarize_at(vars(-year), diff) %>%
  mutate(year = "changed") %>%
  ungroup() %>%
  bind_rows(x_chr, .) %>%
  arrange(id, year) # just to show id=492
# # A tibble: 39 x 18
# id year pland_00_water pland_01_evergr~ pland_02_evergr~ pland_03_decidu~ pland_04_decidu~ pland_05_mixed_~
# <int> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 267 2019 0.0833 0 0 0 0 0
# 2 268 2019 0.2 0 0 0 0 0
# 3 362 2019 0.1 0 0 0 0 0
# 4 420 2019 0.0556 0 0 0 0 0
# 5 421 2019 0.0667 0 0 0 0 0
# 6 484 2019 0.125 0 0 0 0 0
# 7 492 2010 0.1 0 0 0 0 0
# 8 492 2019 0.1 0 0 0 0 0
# 9 492 chan~ 0 0 0 0 0 0
# 10 719 2010 0.0769 0 0 0 0 0
# # ... with 29 more rows, and 10 more variables: pland_06_closed_shrubland <dbl>, pland_07_open_shrubland <dbl>,
# # pland_08_woody_savanna <dbl>, pland_09_savanna <dbl>, pland_10_grassland <dbl>, pland_11_wetland <dbl>,
# # pland_12_cropland <dbl>, pland_13_urban <dbl>, pland_14_mosiac <dbl>, pland_15_barren <dbl>
Explanation:
the first arrange(year) is so that the diff later will have values in an expected order (assuming all years are year-like that sort lexicographically the same as a numerical sort);
the filter first removes any ids that do not have both years, and then ensures we have only those two years; while your data only contains "2010" and "2019", I didn't want to assume that ... it's a harmless filter if that's all you have, remove year %in% c("2010","2019") if desired and safe;
I assume that columns other than id and year are numeric/integer, so summarize_at(vars(-year), diff) is safe (id is out of the picture since it is a grouping variable); if there are non-numerical values, you might be able to use summarize_if(is.numeric, diff) which also works here ... but will silently NA-ize non-numeric fields if present;
bind_rows(x, .) is needed because the filter removed many rows we want/need to retain; and
the last arrange(id,year) is solely demonstrative for this answer.
This question already has answers here:
How to select the rows with maximum values in each group with dplyr? [duplicate]
(6 answers)
Closed 2 years ago.
I want to group my data set by some ID columns and, within each group, keep only the row that has the largest value in the count column. Here is a description of my data set.
BSTN ASTN1 BSTN2 ASTN2 BSTN3 ASTN3 BSTN4 ASTN4 BSTN5 ASTN TRNID TRNID2 TRNID3 TRNID4 TRNID5 count
1 150 0 0 0 0 0 0 0 0 152 1674 0 0 0 0 1
2 150 0 0 0 0 0 0 0 0 152 1676 0 0 0 0 2
3 150 0 0 0 0 0 0 0 0 152 1678 0 0 0 0 2
4 150 0 0 0 0 0 0 0 0 152 1680 0 0 0 0 13
5 150 0 0 0 0 0 0 0 0 152 1682 0 0 0 0 3
6 150 0 0 0 0 0 0 0 0 152 1684 0 0 0 0 4
I want to group and summarise this data into a single row based on IDs the first 10 columns BSTN ASTN1 BSTN2 ASTN2 BSTN3 ASTN3 BSTN4 ASTN4 BSTN5 ASTN.
Then for the rest of the columns, TRNID TRNID2 TRNID3 TRNID4 TRNID5 I would like to replace them with the row with maximum value in column count.
What I want as my final output would look as below.
BSTN ASTN1 BSTN2 ASTN2 BSTN3 ASTN3 BSTN4 ASTN4 BSTN5 ASTN TRNID TRNID2 TRNID3 TRNID4 TRNID5 count
150 0 0 0 0 0 0 0 0 152 1680 0 0 0 0 13
How would I summarise my data? I have 2,931,959 rows with more groups of BSTN, ASTNs.
dput(head(A_Routetable2))
structure(list(BSTN = c(150, 150, 150, 150, 150, 150), ASTN1 = c(0,
0, 0, 0, 0, 0), BSTN2 = c(0, 0, 0, 0, 0, 0), ASTN2 = c(0, 0,
0, 0, 0, 0), BSTN3 = c(0, 0, 0, 0, 0, 0), ASTN3 = c(0, 0, 0,
0, 0, 0), BSTN4 = c(0, 0, 0, 0, 0, 0), ASTN4 = c(0, 0, 0, 0,
0, 0), BSTN5 = c(0, 0, 0, 0, 0, 0), ASTN = c(152, 152, 152, 152,
152, 152), TRNID = c(1674, 1676, 1678, 1680, 1682, 1684), TRNID2 = c(0,
0, 0, 0, 0, 0), TRNID3 = c(0, 0, 0, 0, 0, 0), TRNID4 = c(0, 0,
0, 0, 0, 0), TRNID5 = c(0, 0, 0, 0, 0, 0), count = c(1L, 2L,
2L, 13L, 3L, 4L)), row.names = c(NA, -6L), groups = structure(list(
BSTN = c(150, 150, 150, 150, 150, 150), ASTN1 = c(0, 0, 0,
0, 0, 0), BSTN2 = c(0, 0, 0, 0, 0, 0), ASTN2 = c(0, 0, 0,
0, 0, 0), BSTN3 = c(0, 0, 0, 0, 0, 0), ASTN3 = c(0, 0, 0,
0, 0, 0), BSTN4 = c(0, 0, 0, 0, 0, 0), ASTN4 = c(0, 0, 0,
0, 0, 0), BSTN5 = c(0, 0, 0, 0, 0, 0), ASTN = c(152, 152,
152, 152, 152, 152), TRNID = c(1674, 1676, 1678, 1680, 1682,
1684), TRNID2 = c(0, 0, 0, 0, 0, 0), TRNID3 = c(0, 0, 0,
0, 0, 0), TRNID4 = c(0, 0, 0, 0, 0, 0), .rows = structure(list(
1L, 2L, 3L, 4L, 5L, 6L), ptype = integer(0), class = c("vctrs_list_of",
"vctrs_vctr", "list"))), row.names = c(NA, 6L), class = c("tbl_df",
"tbl", "data.frame"), .drop = TRUE), class = c("grouped_df",
"tbl_df", "tbl", "data.frame"))
You can group_by position and then select row with max value in count.
library(dplyr)
# For each combination of the first ten columns, keep the row with the
# largest `count`.
df %>%
  # clear any grouping already on the input: across() inside group_by()
  # errors when the data arrives grouped
  ungroup() %>%
  group_by(across(1:10)) %>%
  slice(which.max(count)) %>%
  # hand back an ungrouped tibble so later verbs are not silently grouped
  ungroup()
# BSTN ASTN1 BSTN2 ASTN2 BSTN3 ASTN3 BSTN4 ASTN4 BSTN5 ASTN TRNID TRNID2 TRNID3 TRNID4 TRNID5 count
# <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int>
#1 150 0 0 0 0 0 0 0 0 152 1680 0 0 0 0 13
Or group by range of columns
# For each combination of the columns BSTN through ASTN, keep the row with
# the largest `count`.
df %>%
  # clear any grouping already on the input: across() inside group_by()
  # errors when the data arrives grouped
  ungroup() %>%
  group_by(across(BSTN:ASTN)) %>%
  slice(which.max(count)) %>%
  # hand back an ungrouped tibble so later verbs are not silently grouped
  ungroup()
The dput shared by OP is grouped, which results in an error with across. We can ungroup the data first and then run the above, which runs without any error. However, functions from the previous version of dplyr work on it without any error, for example group_by_at:
# Same selection written with the older scoped verb group_by_at(): group on
# the first ten columns, then keep the row with the largest `count` per group.
A_Routetable2 %>%
  group_by_at(seq_len(10)) %>%
  slice(which.max(count))