I have 21 data frames in a list (listb), all with the same headings: Timestamp and Rainfall.
I would like to sort each one by Rainfall (descending) and then subset the top 30 rows (keeping the corresponding Timestamp) of each of the 21 data frames, and then put them back into a single data frame with the name of the original data frame as a heading.
Please find the list of data frames below, along with a small cut from the b1 data frame.
Would I need to create a new data frame for each of the new subsets and then combine them into a list later?
Descending_b1 <- listb$b1[order(-listb$b1$Rainfall), ]
b1_30 <- Descending_b1[1:30, 1:2]
From that, I produce the following
b1_30 <- structure(list(Timestamp = c("25/1/2013", "24/1/2013", "2/2/2004",
"21/3/2010", "16/7/2016", "1/2/2010", "26/1/2007", "29/12/1998",
"24/2/2008", "5/2/2003", "6/2/2003", "11/11/2001", "3/12/2010",
"8/3/2020", "27/12/2010", "29/1/1998", "18/10/2017", "13/3/2007",
"5/4/2006", "10/6/2006", "19/11/2008", "20/2/2015", "26/3/2014",
"15/3/2017", "27/8/2011", "1/3/2013", "27/8/1998", "11/2/2012",
"11/2/2008", "26/1/2013"),
Rainfall = c(238L, 158L, 131L, 131L,129L, 122L, 112L, 109L, 101L, 94L,
92L, 88L, 82L, 81L, 78L, 74L, 71L, 69L, 65L, 64L, 64L,
64L, 63L, 63L, 62L, 61L, 60L, 60L, 58L,57L)),
row.names = c(5915L, 5914L, 2640L, 4874L, 7183L, 4826L, 3725L, 939L, 4118L, 2278L, 2279L, 1827L, 5131L, 8514L, 5155L,
605L, 7642L, 3771L, 3429L, 3495L, 4387L, 6671L, 6340L, 7425L,
5398L, 5950L, 815L, 5566L, 4105L, 5916L), class = "data.frame")
b1_30
#> Timestamp Rainfall
#> 5915 25/1/2013 238
#> 5914 24/1/2013 158
#> 2640 2/2/2004 131
#> 4874 21/3/2010 131
#> 7183 16/7/2016 129
#> 4826 1/2/2010 122
#> 3725 26/1/2007 112
#> 939 29/12/1998 109
#> 4118 24/2/2008 101
#> 2278 5/2/2003 94
#> 2279 6/2/2003 92
#> 1827 11/11/2001 88
#> 5131 3/12/2010 82
#> 8514 8/3/2020 81
#> 5155 27/12/2010 78
#> 605 29/1/1998 74
#> 7642 18/10/2017 71
#> 3771 13/3/2007 69
#> 3429 5/4/2006 65
#> 3495 10/6/2006 64
#> 4387 19/11/2008 64
#> 6671 20/2/2015 64
#> 6340 26/3/2014 63
#> 7425 15/3/2017 63
#> 5398 27/8/2011 62
#> 5950 1/3/2013 61
#> 815 27/8/1998 60
#> 5566 11/2/2012 60
#> 4105 11/2/2008 58
#> 5916 26/1/2013 57
I would like to do the same with the rest of the data frames in the list, creating a new data frame for each while keeping the initial data frame's name, and then combine them into a new list.
Suppose you have a list like this
library(dplyr)
library(lubridate)

set.seed(2021)
listb <- list(b1 = data.frame(Timestamp = as.Date("2010-01-01") + days(sample(1:100, 10)),
                              Rainfall = sample(200:300, 10)),
              b2 = data.frame(Timestamp = as.Date("2010-01-01") + days(sample(1:100, 10)),
                              Rainfall = sample(200:300, 10)),
              b3 = data.frame(Timestamp = as.Date("2010-01-01") + days(sample(1:100, 10)),
                              Rainfall = sample(200:300, 10)))
> listb
$b1
Timestamp Rainfall
1 2010-01-08 275
2 2010-02-08 250
3 2010-02-16 259
4 2010-02-28 217
5 2010-01-13 298
6 2010-03-12 202
7 2010-03-06 245
8 2010-04-10 225
9 2010-03-11 235
10 2010-01-24 285
$b2
Timestamp Rainfall
1 2010-02-01 242
2 2010-04-09 258
3 2010-01-20 269
4 2010-03-10 285
5 2010-03-28 298
6 2010-01-06 262
7 2010-03-15 278
8 2010-03-05 233
9 2010-02-08 221
10 2010-01-19 215
$b3
Timestamp Rainfall
1 2010-03-21 216
2 2010-03-30 240
3 2010-01-18 230
4 2010-01-21 272
5 2010-03-10 292
6 2010-04-05 226
7 2010-03-14 210
8 2010-03-25 235
9 2010-03-09 237
10 2010-01-03 278
Now you only need to do the following (needless to say, replace the n argument in slice_max with your desired n = 30):
purrr::map2_dfr(listb, names(listb), ~ .x %>%
                  mutate(list_name = .y) %>%
                  slice_max(Rainfall, n = 5))
Timestamp Rainfall list_name
1 2010-01-13 298 b1
2 2010-01-24 285 b1
3 2010-01-08 275 b1
4 2010-02-16 259 b1
5 2010-02-08 250 b1
6 2010-03-28 298 b2
7 2010-03-10 285 b2
8 2010-03-15 278 b2
9 2010-01-20 269 b2
10 2010-01-06 262 b2
11 2010-03-10 292 b3
12 2010-01-03 278 b3
13 2010-01-21 272 b3
14 2010-03-30 240 b3
15 2010-03-09 237 b3
If you want to return the output as a similar list instead:
purrr::map(listb, ~ .x %>%
slice_max(Rainfall, n=5))
$b1
Timestamp Rainfall
1 2010-01-13 298
2 2010-01-24 285
3 2010-01-08 275
4 2010-02-16 259
5 2010-02-08 250
$b2
Timestamp Rainfall
1 2010-03-28 298
2 2010-03-10 285
3 2010-03-15 278
4 2010-01-20 269
5 2010-01-06 262
$b3
Timestamp Rainfall
1 2010-03-10 292
2 2010-01-03 278
3 2010-01-21 272
4 2010-03-30 240
5 2010-03-09 237
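An equivalent route (just a sketch, not part of the original answer) is to take the top rows per element with purrr::map() as above and then stitch the results together with dplyr::bind_rows(), whose .id argument records the name of the originating data frame:
library(dplyr)
library(purrr)
# top 30 rows by Rainfall from each data frame in the list
top30 <- map(listb, ~ slice_max(.x, Rainfall, n = 30))
# bind back into one data frame; .id keeps the original element name as a column
bind_rows(top30, .id = "list_name")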
I have a data frame that looks like this:
Date Value1 Value2 Value3
1997Q1 100 130 120
1997Q1 100 130 124
1997Q1 120 136 154
1997Q2 180 145 154
1997Q2 186 134 126
1997Q2 186 124 176
1997Q3 190 143 176
1997Q3 192 143 123
I would like to calculate the differences between consecutive values in each value column within the same date, for example the differences in the Value1 column for 1997Q1, then 1997Q2, and so on.
I would like these differences to be shown in new columns, so that the result would look something like this:
Date Value1 Value2 Value3 DiffVal1 DiffVal2 DiffVal3
1997Q1 100 130 120 0 0 4
1997Q1 100 130 124 20 6 30
1997Q1 120 136 154 N/A N/A N/A
1997Q2 180 145 154 6 -11 -28
1997Q2 186 134 126 0 10 50
1997Q2 186 124 176 N/A N/A N/A
1997Q3 190 143 176 2 0 -53
1997Q3 192 143 123
You can use dplyr functions for this. The ~ .x - lead(.x) is the function applied to every value column, selected with starts_with(): we take the current value minus the next value. If you need lag instead, switch it around: ~ lag(.x) - .x.
library(dplyr)

df1 %>%
  group_by(Date) %>%
  mutate(across(starts_with("Value"), ~ .x - lead(.x), .names = "diff_{.col}"))
If the values are numeric and the column names are not easily found, you can use mutate(across(where(is.numeric), ~ .x - lead(.x), .names = "diff_{.col}")) instead; a sketch follows the output below.
# A tibble: 8 × 7
# Groups: Date [3]
Date Value1 Value2 Value3 diff_Value1 diff_Value2 diff_Value3
<chr> <int> <int> <int> <int> <int> <int>
1 1997Q1 100 130 120 0 0 -4
2 1997Q1 100 130 124 -20 -6 -30
3 1997Q1 120 136 154 NA NA NA
4 1997Q2 180 145 154 -6 11 28
5 1997Q2 186 134 126 0 10 -50
6 1997Q2 186 124 176 NA NA NA
7 1997Q3 190 143 176 -2 0 53
8 1997Q3 192 143 123 NA NA NA
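A minimal sketch of that where(is.numeric) variant, assuming (as in the data below) that Date is the only non-numeric column; the grouping column is excluded from across() automatically:
library(dplyr)
df1 %>%
  group_by(Date) %>%
  mutate(across(where(is.numeric), ~ .x - lead(.x), .names = "diff_{.col}"))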
data:
df1 <- structure(list(Date = c("1997Q1", "1997Q1", "1997Q1", "1997Q2",
"1997Q2", "1997Q2", "1997Q3", "1997Q3"), Value1 = c(100L, 100L,
120L, 180L, 186L, 186L, 190L, 192L), Value2 = c(130L, 130L, 136L,
145L, 134L, 124L, 143L, 143L), Value3 = c(120L, 124L, 154L, 154L,
126L, 176L, 176L, 123L)), class = "data.frame", row.names = c(NA,
-8L))
I have previously posted a question on subsetting columns from row values on GIS StackExchange: here.
In short, I would like to set data to NA, if the column name (e.g. 100) is less than the row value of s_mean (e.g. value is 101).
It worked for specific applications but now it does not work, and I get the following error:
Error: Can't subset columns that don't exist.
x Locations 304, 303, 302, 301, 300, etc. don't exist.
i There are only 197 columns.
Run `rlang::last_error()` to see where the error occurred.
Here is the data:
# A tibble: 2,937 x 197
ID doy FireID Year sE NAME L1NAME ID_2 area s_count s_mean s_median s_stdev s_min doydiff ID_E5 32 33 34 35
<dbl> <dbl> <dbl> <dbl> <dbl> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 2246 173 30048 2015 0 A T 30048 3.86e6 0 100 0 0 0 73 56 267. 265. 264. 265.
2 2275 174 30076 2015 0 A T 30076 2.15e6 0 100 0 0 0 74 533 266. 266. 263. 264.
3 704 294 28542 2015 1381 A T 28542 6.44e5 0 100 0 0 0 194 562 277. 277. 278. 279.
4 711 110 28549 2015 0 NA NA 28549 2.15e5 0 101 0 0 0 9 569 262. 264. 260. 262.
5 690 161 28528 2015 232 A T 28528 4.29e5 0 101 0 0 0 60 580 280. 279. 280. 279.
6 692 331 28530 2015 0 M M 28530 2.15e5 0 101 0 0 0 130 582 280. 279. 281. 280.
7 667 47 28506 2015 232 M M 28506 2.79e6 0 10 0 0 0 37 589 280. 282. 281. 280.
8 672 188 28511 2015 0 NA NA 28511 2.79e6 0 101 0 0 0 87 594 254. 261. 259. 254.
9 657 171 28496 2015 578 NA NA 28496 8.59e5 0 101 0 0 0 170 611 256. 263. 260. 254.
10 635 301 28474 2015 1084 M M 28474 1.50e6 0 101 0 0 0 200 621 282. 282. 282. 281.
The data columns continue up to the column named 212; not all of them are shown here.
Here is the script:
library(readr)

polydata <- read_csv("path/E15.csv")
polydata$s_mean <- round(polydata$s_mean)
polydata <- polydata[order(polydata$s_mean), ]

# slice each row, and put each slice in a list
df_sub <- lapply(1:nrow(polydata),
                 function(x){
                   polydata[x, c(1, 10, polydata$s_mean[x]:187+10)] # + 10 because of the offset: doy_columns start at 11
                 })
Why do I get an error that I return too many columns when I specify 187+10 as the subsetting parameter?
What should be changed?
I eventually want this to be the outcome (compare the column names to s_mean to better understand the desired output):
ID s_mean 32 33 34 35 36 ... 212
1 30 267 278 270 269 267 ... 298
2 100 NA NA NA NA NA ... 298
3 35 NA NA NA 242 246 ... 298
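As an aside (not part of the original question or answer), R's operator precedence may be relevant to the out-of-range locations in the error: `:` binds tighter than `+`, so s_mean[x]:187+10 is (s_mean[x]:187) + 10, and when s_mean[x] exceeds 187 the sequence counts down and the + 10 pushes it past the 197 existing columns. A quick illustration:
# ':' binds tighter than '+', so a:b + 10 means (a:b) + 10, not a:(b + 10)
5:8 + 10
#> [1] 15 16 17 18
# if s_mean[x] happens to be larger than 187, the sequence counts *down*,
# and adding 10 then produces positions beyond the 197 existing columns
head(294:187 + 10)
#> [1] 304 303 302 301 300 299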
We can use across from dplyr and refer to column names using cur_column. From there, we can use an ifelse to replace the data with NA if the column name is less than s_mean. I created a toy dataset to illustrate the solution which can be found at the end of this post.
library(dplyr)
pdat1 %>%
mutate(across(`32`:`35`,
~ifelse(s_mean > as.numeric(cur_column()), NA, .)))
#> ID s_mean 32 33 34 35
#> 1 2246 30 267 265 264 265
#> 2 2275 100 NA NA NA NA
#> 3 704 100 NA NA NA NA
#> 4 711 34 NA NA 260 262
#> 5 690 101 NA NA NA NA
#> 6 692 101 NA NA NA NA
#> 7 667 10 280 282 281 280
#> 8 672 101 NA NA NA NA
#> 9 657 101 NA NA NA NA
#> 10 635 101 NA NA NA NA
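Applied to the real data, the same pattern should only need a wider column selection; this is a sketch that assumes the day-of-year columns run from `32` to `212` as described in the question:
library(dplyr)
polydata %>%
  mutate(across(`32`:`212`,
                ~ ifelse(s_mean > as.numeric(cur_column()), NA, .)))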
Toy Dataset:
pdat1 <- structure(list(ID = c(2246L, 2275L, 704L, 711L, 690L, 692L, 667L, 672L,
657L, 635L),
s_mean = c(30L, 100L, 100L, 34L, 101L, 101L, 10L, 101L,
101L, 101L),
`32` = c(267, 266, 277, 262, 280, 280, 280, 254, 256, 282),
`33` = c(265, 266, 277, 264, 279, 279, 282, 261, 263, 282),
`34` = c(264, 263, 278, 260, 280, 281, 281, 259, 260, 282),
`35` = c(265, 264, 279, 262, 279, 280, 280, 254, 254, 281)),
class = "data.frame",
row.names = c("1", "2", "3", "4","5", "6", "7", "8", "9", "10"))
#> ID s_mean 32 33 34 35
#> 1 2246 30 267 265 264 265
#> 2 2275 100 266 266 263 264
#> 3 704 100 277 277 278 279
#> 4 711 34 262 264 260 262
#> 5 690 101 280 279 280 279
#> 6 692 101 280 279 281 280
#> 7 667 10 280 282 281 280
#> 8 672 101 254 261 259 254
#> 9 657 101 256 263 260 254
#> 10 635 101 282 282 282 281
So I need to merge two data frames:
The first data frame contains dates in YYYY-mm-dd format and event lengths:
datetime length
2003-06-03 1
2003-06-07 1
2003-06-13 1
2003-06-17 3
2003-06-28 5
2003-07-10 1
2003-07-23 1
...
The second data frame contains dates in the same format and discharge data:
datetime q
2003-05-29 36.2
2003-05-30 34.6
2003-05-31 33.1
2003-06-01 30.7
2003-06-02 30.0
2003-06-03 153.0
2003-06-04 69.0
...
The second data frame is much larger.
I want to merge/join only the following rows of the second data frame to the first:
all rows that have the same date as the first frame (I know this can be done with left_join(df1, df2, by = c("datetime")))
two rows before that row
n-1 rows after that row, where n = "length" value of row in first data frame.
I would like to identify the rows belonging to the same event as well.
Ideally I would have the following output (notice the event from 2003-06-17):
EventDatesNancy length q event#
2003-06-03 1 153.0 1
2003-06-07 1 120.0 2
2003-06-13 1 45.3 3
2003-06-15 na 110.0 4
2003-06-16 na 53.1 4
2003-06-17 3 78.0 4
2003-06-18 na 167.0 4
2003-06-19 na 145.0 4
...
I hope this makes clear what I am trying to do.
This might be one approach using tidyverse and fuzzyjoin.
First, indicate event numbers in your first data.frame. Then add two columns for the start and end dates (the start date is 2 days before the date, and the end date is length - 1 days after the date).
Then, you can use fuzzy_inner_join to get the selected rows from the second data.frame. Here, you will want to include where the datetime in the second data.frame falls after the start date and before the end date of the first data.frame.
library(tidyverse)
library(fuzzyjoin)
df1$event <- seq_len(nrow(df1))
df1$start_date <- df1$datetime - 2
df1$end_date <- df1$datetime + df1$length - 1
fuzzy_inner_join(
df1,
df2,
by = c("start_date" = "datetime", "end_date" = "datetime"),
match_fun = c(`<=`, `>=`)
) %>%
select(datetime.y, length, q, event)
I tried this out with some made up data:
R> df1
datetime length
1 2003-06-03 1
2 2003-06-12 1
3 2003-06-21 1
4 2003-06-30 3
5 2003-07-09 5
6 2003-07-18 1
7 2003-07-27 1
8 2003-08-05 2
9 2003-08-14 1
10 2003-08-23 1
11 2003-09-01 3
R> df2
datetime q
1 2003-06-03 44
2 2003-06-04 52
3 2003-06-05 34
4 2003-06-06 20
5 2003-06-07 57
6 2003-06-08 67
7 2003-06-09 63
8 2003-06-10 51
9 2003-06-11 56
10 2003-06-12 37
11 2003-06-13 16
12 2003-06-14 54
13 2003-06-15 46
14 2003-06-16 6
15 2003-06-17 32
16 2003-06-18 91
17 2003-06-19 61
18 2003-06-20 42
19 2003-06-21 28
20 2003-06-22 98
21 2003-06-23 77
22 2003-06-24 81
23 2003-06-25 13
24 2003-06-26 15
25 2003-06-27 73
26 2003-06-28 38
27 2003-06-29 27
28 2003-06-30 49
29 2003-07-01 10
30 2003-07-02 89
31 2003-07-03 9
32 2003-07-04 80
33 2003-07-05 68
34 2003-07-06 26
35 2003-07-07 31
36 2003-07-08 29
37 2003-07-09 84
38 2003-07-10 60
39 2003-07-11 19
40 2003-07-12 97
41 2003-07-13 35
42 2003-07-14 47
43 2003-07-15 70
This will give the following output:
datetime.y length q event
1 2003-06-03 1 44 1
2 2003-06-10 1 51 2
3 2003-06-11 1 56 2
4 2003-06-12 1 37 2
5 2003-06-19 1 61 3
6 2003-06-20 1 42 3
7 2003-06-21 1 28 3
8 2003-06-28 3 38 4
9 2003-06-29 3 27 4
10 2003-06-30 3 49 4
11 2003-07-01 3 10 4
12 2003-07-02 3 89 4
13 2003-07-07 5 31 5
14 2003-07-08 5 29 5
15 2003-07-09 5 84 5
16 2003-07-10 5 60 5
17 2003-07-11 5 19 5
18 2003-07-12 5 97 5
19 2003-07-13 5 35 5
If the output desired is different than above, please let me know what should be different so that I can correct it.
Data
df1 <- structure(list(datetime = structure(c(12206, 12215, 12224, 12233,
12242, 12251, 12260, 12269, 12278, 12287, 12296), class = "Date"),
length = c(1, 1, 1, 3, 5, 1, 1, 2, 1, 1, 3), event = 1:11,
start_date = structure(c(12204, 12213, 12222, 12231, 12240,
12249, 12258, 12267, 12276, 12285, 12294), class = "Date"),
end_date = structure(c(12206, 12215, 12224, 12235, 12246,
12251, 12260, 12270, 12278, 12287, 12298), class = "Date")), row.names = c(NA,
-11L), class = "data.frame")
df2 <- structure(list(datetime = structure(c(12206, 12207, 12208, 12209,
12210, 12211, 12212, 12213, 12214, 12215, 12216, 12217, 12218,
12219, 12220, 12221, 12222, 12223, 12224, 12225, 12226, 12227,
12228, 12229, 12230, 12231, 12232, 12233, 12234, 12235, 12236,
12237, 12238, 12239, 12240, 12241, 12242, 12243, 12244, 12245,
12246, 12247, 12248), class = "Date"), q = c(44L, 52L, 34L, 20L,
57L, 67L, 63L, 51L, 56L, 37L, 16L, 54L, 46L, 6L, 32L, 91L, 61L,
42L, 28L, 98L, 77L, 81L, 13L, 15L, 73L, 38L, 27L, 49L, 10L, 89L,
9L, 80L, 68L, 26L, 31L, 29L, 84L, 60L, 19L, 97L, 35L, 47L, 70L
)), class = "data.frame", row.names = c(NA, -43L))
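If you would rather avoid fuzzyjoin, a rough equivalent (a sketch, not part of the original answer) is to expand each event into its full date window with tidyr and then do an ordinary inner join; this assumes the df1 and df2 objects above:
library(dplyr)
library(tidyr)
df1 %>%
  mutate(event = row_number()) %>%
  rowwise() %>%
  # the window of dates covered by each event: 2 days before up to length - 1 days after
  mutate(day = list(seq(datetime - 2, datetime + length - 1, by = "day"))) %>%
  ungroup() %>%
  unnest(day) %>%
  inner_join(df2, by = c("day" = "datetime")) %>%
  select(datetime = day, length, q, event)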
I have a data set that contains some missing values which can be completed by merging with another dataset. My example:
This is the updated data set I am working with.
DF1
Name Paper Book Mug soap computer tablet coffee coupons
1 2 3 4 5 6 7 8 9
2 21 22 23 23 23 7 23 9
3 56 57 58 59 60 7 62 9
4 80.33333 81.33333 82.33333 83 83.66667 7 85 9
5 107.3333 108.3333 109.3333 110 110.6667 7 112 9
6 134.3333 135.3333 136.3333 137 137.6667 7 139 9
7 161.3333 162.3333 163.3333 164 164.6667
8 188.3333 189.3333 190.3333 191 191.6667 7 193 9
9 215.3333 216.3333 217.3333 218 218.6667 7 220 9
10 242.3333 243.3333 244.3333 245 245.6667 7 247 9
11 269.3333 270.3333 271.3333 272 272.6667 7 274 9
12 296.3333 297.3333 298.3333 299 299.6667
13 323.3333 324.3333 325.3333 326 326.6667 7 328 9
14 350.3333 351.3333 352.3333 353 353.6667 7 355 9
15 377.3333 378.3333 379.3333 380 380.6667
16 404.3333 405.3333 406.3333 407 407.6667 7 409 9
17 431.3333 432.3333 433.3333 434 434.6667 7 436 9
18 458.3333 459.3333 460.3333 461 461.6667 7 463 9
19 485.3333 486.3333 487.3333 488 488.6667
DF2
Name Paper Book Mug soap computer tablet coffee coupons
7 161.3333 162.3333 163.3333 164 164.6667 6 6 6
12 296.3333 297.3333 298.3333 299 299.6667 88 96 25
15 377.3333 378.3333 379.3333 380 380.6667 88 62 25
19 485.3333 486.3333 487.3333 488 488.6667 88 88 78
I want to get:
Name Paper Book Mug soap computer tablet coffee coupons
1 2 3 4 5 6 7 8 9
2 21 22 23 23 23 7 23 9
3 56 57 58 59 60 7 62 9
4 80.33333 81.33333 82.33333 83 83.66667 7 85 9
5 107.3333 108.3333 109.3333 110 110.6667 7 112 9
6 134.3333 135.3333 136.3333 137 137.6667 7 139 9
7 161.3333 162.3333 163.3333 164 164.6667 6 6 6
8 188.3333 189.3333 190.3333 191 191.6667 7 193 9
9 215.3333 216.3333 217.3333 218 218.6667 7 220 9
10 242.3333 243.3333 244.3333 245 245.6667 7 247 9
11 269.3333 270.3333 271.3333 272 272.6667 7 274 9
12 296.3333 297.3333 298.3333 299 299.6667 88 96 25
13 323.3333 324.3333 325.3333 326 326.6667 7 328 9
14 350.3333 351.3333 352.3333 353 353.6667 7 355 9
15 377.3333 378.3333 379.3333 380 380.6667 88 62 25
16 404.3333 405.3333 406.3333 407 407.6667 7 409 9
17 431.3333 432.3333 433.3333 434 434.6667 7 436 9
18 458.3333 459.3333 460.3333 461 461.6667 7 463 9
19 485.3333 486.3333 487.3333 488 488.6667 88 88 78
I have tried the following code:
DF1[,c(4:6)][is.na(DF1[,c(4:6)]<-DF2[,c(2:4)][match(DF1[,1],DF2[,1])]
[which(is.na(DF1[,c(4:6)]))]
One of the solutions using dplyr will work if I omit the columns which are already complete. Not sure if it is my version of dplyr, which I updated last week.
Any help is greatly appreciated! Thanks!
We can do a left join and then coalesce the columns
library(dplyr)
DF1 %>%
left_join(DF2, by = c('NameVar')) %>%
transmute(NameVar, Var1, Var2,
Var3 = coalesce(Var3.x, Var3.y),
Var4 = coalesce(Var4.x, Var4.y),
Var5 = coalesce(Var5.x, Var5.y))
-output
# NameVar Var1 Var2 Var3 Var4 Var5
#1 Sub1 30 45 40 34 65
#2 Sub2 25 30 30 45 45
#3 Sub3 74 34 25 30 49
#4 Sub4 30 45 40 34 65
#5 Sub5 25 30 69 56 72
#6 Sub6 74 34 74 34 60
Or using data.table
library(data.table)
nm1 <- setdiff(intersect(names(DF1), names(DF2)), 'NameVar')
setDT(DF1)[DF2, (nm1) := Map(fcoalesce, mget(nm1),
mget(paste0("i.", nm1))), on = .(NameVar)]
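For completeness (not part of the original answer), dplyr 1.0+ also has rows_patch(), which overwrites only the NA values in matching rows; a minimal sketch with the same toy data:
library(dplyr)
# fill the NA values of DF1 with the matching values from DF2, keyed by NameVar
rows_patch(DF1, DF2, by = "NameVar")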
data
DF1 <- structure(list(NameVar = c("Sub1", "Sub2", "Sub3", "Sub4", "Sub5",
"Sub6"), Var1 = c(30L, 25L, 74L, 30L, 25L, 74L), Var2 = c(45L,
30L, 34L, 45L, 30L, 34L), Var3 = c(40L, NA, NA, 40L, 69L, NA),
Var4 = c(34L, NA, NA, 34L, 56L, NA), Var5 = c(65L, NA, NA,
65L, 72L, NA)), class = "data.frame", row.names = c(NA, -6L
))
DF2 <- structure(list(NameVar = c("Sub2", "Sub3", "Sub6"), Var3 = c(30L,
25L, 74L), Var4 = c(45L, 30L, 34L), Var5 = c(45L, 49L, 60L)),
class = "data.frame", row.names = c(NA,
-3L))
I tried all the similar posts, but none of the answers seemed to work for me. I want to delete 8500+ rows (by row name only) from a data frame with 27,000+ rows. The other columns are completely different, but the smaller dataset was derived from the larger one, and just looking up names shows me that whatever I look for from the smaller df is present in the larger df. I could of course do this manually (busy work for sure!), but it seems like there should be a simple computational answer.
I have tried:
fordel<-df2[1,]
df3<-df1[!rownames(df1) %in% fordel
l1<- as.vector(df2[1,])
df3<- df1[1-c(l1),]
and lots of other crazy ideas!
Here is a smallish example: df1:
Ent_gene_id clone57_RNA clone43_RNA_2 clone67_RNA clone55_RNA
ENSMUSG00000000001.4 10634 6954 6835 6510
ENSMUSG00000000003.15 0 0 0 0
ENSMUSG00000000028.14 559 1570 807 1171
ENSMUSG00000000031.15 5748 174 4103 146
ENSMUSG00000000037.16 37 194 49 96
ENSMUSG00000000049.11 0 3 1 0
ENSMUSG00000000056.7 1157 1125 806 947
ENSMUSG00000000058.6 75 304 123 169
ENSMUSG00000000078.6 4012 4391 5637 3854
ENSMUSG00000000085.16 381 560 482 368
ENSMUSG00000000088.6 2667 4777 3483 3450
ENSMUSG00000000093.6 3 48 41 22
ENSMUSG00000000094.12 23 201 102 192
df2
structure(list(base_mean = c(7962.408875, 947.1240794, 43.76698418),
               log2foldchange = c(-0.363434063, -0.137403759, -0.236463207),
               lfcSE = c(0.096816743, 0.059823215, 0.404929452),
               stat = c(-3.753834854, -2.296830066, -0.583961493)),
          row.names = c("ENSMUSG00000000001.4", "ENSMUSG00000000056.7",
                        "ENSMUSG00000000093.6"), class = "data.frame")
I want to delete from df1 the rows corresponding to the rownames in df2.
Suggestions really appreciated!
You mentioned row names, but your data does not include them, so I'll assume they really don't matter (or don't exist). Also, your df2 has more column headers than columns; not sure what's going on there, so I'll ignore it.
Data
df1 <- structure(list(Ent_gene_id = c("ENSMUSG00000000001.4", "ENSMUSG00000000003.15",
"ENSMUSG00000000028.14", "ENSMUSG00000000031.15", "ENSMUSG00000000037.16",
"ENSMUSG00000000049.11", "ENSMUSG00000000056.7", "ENSMUSG00000000058.6",
"ENSMUSG00000000078.6", "ENSMUSG00000000085.16", "ENSMUSG00000000088.6",
"ENSMUSG00000000093.6", "ENSMUSG00000000094.12"), clone57_RNA = c(10634L,
0L, 559L, 5748L, 37L, 0L, 1157L, 75L, 4012L, 381L, 2667L, 3L,
23L), clone43_RNA_2 = c(6954L, 0L, 1570L, 174L, 194L, 3L, 1125L,
304L, 4391L, 560L, 4777L, 48L, 201L), clone67_RNA = c(6835L,
0L, 807L, 4103L, 49L, 1L, 806L, 123L, 5637L, 482L, 3483L, 41L,
102L), clone55_RNA = c(6510L, 0L, 1171L, 146L, 96L, 0L, 947L,
169L, 3854L, 368L, 3450L, 22L, 192L)), class = "data.frame", row.names = c(NA,
-13L))
df2 <- structure(list(Ent_gene_id = c("ENSMUSG00000000001.4", "ENSMUSG00000000056.7",
"ENSMUSG00000000093.6"), base_mean = c(7962.408875, 947.1240794,
43.76698418), log2foldchange = c(-0.36343406, -0.137403759, -0.236463207
), pvalue = c(0.00017415, 0.021628466, 0.55924622)), class = "data.frame", row.names = c(NA,
-3L))
Base
df1[!df1$Ent_gene_id %in% df2$Ent_gene_id,]
# Ent_gene_id clone57_RNA clone43_RNA_2 clone67_RNA clone55_RNA
# 2 ENSMUSG00000000003.15 0 0 0 0
# 3 ENSMUSG00000000028.14 559 1570 807 1171
# 4 ENSMUSG00000000031.15 5748 174 4103 146
# 5 ENSMUSG00000000037.16 37 194 49 96
# 6 ENSMUSG00000000049.11 0 3 1 0
# 8 ENSMUSG00000000058.6 75 304 123 169
# 9 ENSMUSG00000000078.6 4012 4391 5637 3854
# 10 ENSMUSG00000000085.16 381 560 482 368
# 11 ENSMUSG00000000088.6 2667 4777 3483 3450
# 13 ENSMUSG00000000094.12 23 201 102 192
dplyr
dplyr::anti_join(df1, df2, by = "Ent_gene_id")
# Ent_gene_id clone57_RNA clone43_RNA_2 clone67_RNA clone55_RNA
# 1 ENSMUSG00000000003.15 0 0 0 0
# 2 ENSMUSG00000000028.14 559 1570 807 1171
# 3 ENSMUSG00000000031.15 5748 174 4103 146
# 4 ENSMUSG00000000037.16 37 194 49 96
# 5 ENSMUSG00000000049.11 0 3 1 0
# 6 ENSMUSG00000000058.6 75 304 123 169
# 7 ENSMUSG00000000078.6 4012 4391 5637 3854
# 8 ENSMUSG00000000085.16 381 560 482 368
# 9 ENSMUSG00000000088.6 2667 4777 3483 3450
# 10 ENSMUSG00000000094.12 23 201 102 192
Edit: same thing but with row names:
# update my df1 to change Ent_gene_id from a column to rownames
rownames(df1) <- df1$Ent_gene_id
df1$Ent_gene_id <- NULL
# use your updated df2 (from dput)
# df2 <- structure(...)
df1[ !rownames(df1) %in% rownames(df2), ]
# clone57_RNA clone43_RNA_2 clone67_RNA clone55_RNA
# ENSMUSG00000000003.15 0 0 0 0
# ENSMUSG00000000028.14 559 1570 807 1171
# ENSMUSG00000000031.15 5748 174 4103 146
# ENSMUSG00000000037.16 37 194 49 96
# ENSMUSG00000000049.11 0 3 1 0
# ENSMUSG00000000058.6 75 304 123 169
# ENSMUSG00000000078.6 4012 4391 5637 3854
# ENSMUSG00000000085.16 381 560 482 368
# ENSMUSG00000000088.6 2667 4777 3483 3450
# ENSMUSG00000000094.12 23 201 102 192
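As a small extra check (not in the original answer), you can confirm programmatically that every row name in df2 really occurs in df1 before dropping them:
# should be TRUE if every row of df2 has a counterpart in df1
all(rownames(df2) %in% rownames(df1))
# number of rows that will remain after dropping the matches
nrow(df1) - sum(rownames(df1) %in% rownames(df2))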