How to perform an rbind-like operation by column prefix using R - r

Suppose I have this dataset:
# Example data: one column per (day, variable) pair, named X<day>_<variable>.
dat <- structure(
  list(
    X0_B02 = c(108L, 280L, 362L, 918L, 141L),
    X0_B03 = c(398L, 733L, 725L, 1116L, 421L),
    X0_B04 = c(214L, 464L, 692L, 1394L, 238L),
    X1_B02 = c(480L, 484L, 544L, 911L, 380L),
    X1_B03 = c(760L, 856L, 849L, 1110L, 611L),
    X1_B04 = c(546L, 599L, 737L, 1348L, 431L)
  ),
  class = "data.frame",
  row.names = c(NA, -5L)
)
0_ and 1_ are prefixes of the variable names (R shows them as X0_ and X1_). 0_ means day zero, 1_ means the first day, and 2_ would mean the second day of observations for the variables B02, B03 and B04.
I need the days to be stacked in rows rather than spread across columns, with a column indicating the day, like this:
day B02 B03 B04
0 108 398 214
0 280 733 464
0 362 725 692
0 918 1116 1394
0 141 421 238
1 480 760 546
1 484 856 599
1 544 849 737
1 911 1110 1348
1 380 611 431
Is there a way to do this with the rbind function, or do we need another approach to get the desired output?
Thank you.

Using tidyverse, you can use pivot_longer with a names_pattern. This will allow the number after "X" to be used for the new day column, and the word after underscore for the other column names.
library(tidyr)

# Column names look like "X<day>_<variable>": the first capture group fills
# the new `day` column, the second one (".value") names the output columns.
pivot_longer(
  data = dat,
  cols = everything(),
  names_pattern = "X(\\d+)_(\\w+)",
  names_to = c("day", ".value")
)
Output
day B02 B03 B04
<chr> <int> <int> <int>
1 0 108 398 214
2 1 480 760 546
3 0 280 733 464
4 1 484 856 599
5 0 362 725 692
6 1 544 849 737
7 0 918 1116 1394
8 1 911 1110 1348
9 0 141 421 238
10 1 380 611 431

Related

transform data from wide to long in r [duplicate]

This question already has answers here:
Transforming wide data to long format with multiple variables
(3 answers)
Closed 3 months ago.
I have a dataset similar to this:
id_male<-rep(c(1:100),times=1)
df1<-data.frame(id_male)
df1$id_female <- sample(500:1000, length(df1$id_male), replace=TRUE)
df1$var_ma_1 <- sample(500:1000, length(df1$id_male))
df1$var_ma_2 <- sample(500:1000, length(df1$id_male))
df1$var_ma_3 <- sample(500:1000, length(df1$id_male))
df1$var_ma_4 <- sample(500:1000, length(df1$id_male))
df1$diff_1 <- sample(900:1000, length(df1$id_male))
df1$diff_2 <- sample(800:1000, length(df1$id_male))
df1$diff_3 <- sample(100:1000, length(df1$id_male))
df1$diff_4 <- sample(200:1000, length(df1$id_male))
df1$var <- sample(0:1, length(df1$id_male), replace=TRUE)
id_male id_female var_ma_1 var_ma_2 var_ma_3 var_ma_4 diff_1 diff_2 diff_3 diff_4 var
1 522 839 996 866 661 904 888 333 701 1
2 548 706 967 855 682 974 838 257 572 0
3 799 580 903 671 636 911 889 104 225 1
4 518 745 565 719 707 959 983 465 239 0
5 769 719 760 951 687 943 997 474 562 0
6 745 642 1000 926 659 950 859 168 849 0
and I wanted to reshape this data to a long format:
id_male id_female visit var_ma diff var
1 522 1 839 904 1
1 522 2 996 888 1
1 522 3 866 333 1
1 522 4 661 701 1
2 548 1 967 974 0
2 548 2 855 838 0
2 548 3 682 257 0
2 548 4 947 572 0
I tried pivot_wider, reshape, and melt, but can't get it exactly what I want. Thank you in advance for your help!
Since part of some column names are kept as columns and part as a value in a new column, I think we can use names_pattern to separate out the number from the column name to form visit, and keep the rest.
# Keep id_male, id_female and var as they are. Every other column name is
# split at its trailing "_<digits>": the part before it (".value") names the
# output column, the digits go into `visit`.
tidyr::pivot_longer(
  data = df1,
  cols = -c(id_male, id_female, var),
  names_to = c(".value", "visit"),
  names_pattern = "(.*)_([0-9]+)$"
)
# # A tibble: 24 x 6
# id_male id_female var visit var_ma diff
# <int> <int> <int> <chr> <int> <int>
# 1 1 522 1 1 839 904
# 2 1 522 1 2 996 888
# 3 1 522 1 3 866 333
# 4 1 522 1 4 661 701
# 5 2 548 0 1 706 974
# 6 2 548 0 2 967 838
# 7 2 548 0 3 855 257
# 8 2 548 0 4 682 572
# 9 3 799 1 1 580 911
# 10 3 799 1 2 903 889
# # ... with 14 more rows
If you're unfamiliar with regex,
(.*)_([0-9]+)$
^^ 0 or more of anything
^^^^ parens capture this as the first "group"
^ the literal underscore character
^^^^^^ 1 or more digit (0-9 only)
^^^^^^^^ parens capture this as the second "group"
^ end of string
The two "groups" correspond to the names_to argument.
Data
df1 <- structure(list(id_male = 1:6, id_female = c(522L, 548L, 799L, 518L, 769L, 745L), var_ma_1 = c(839L, 706L, 580L, 745L, 719L, 642L), var_ma_2 = c(996L, 967L, 903L, 565L, 760L, 1000L), var_ma_3 = c(866L, 855L, 671L, 719L, 951L, 926L), var_ma_4 = c(661L, 682L, 636L, 707L, 687L, 659L), diff_1 = c(904L, 974L, 911L, 959L, 943L, 950L), diff_2 = c(888L, 838L, 889L, 983L, 997L, 859L), diff_3 = c(333L, 257L, 104L, 465L, 474L, 168L), diff_4 = c(701L, 572L, 225L, 239L, 562L, 849L), var = c(1L, 0L, 1L, 0L, 0L, 0L)), class = "data.frame", row.names = c(NA, -6L))

Creating differences in a new column for certain dates in R

i have a data frame that looks like this;
Date Value1 Value 2 Value 3
1997Q1 100 130 120
1997Q1 100 130 124
1997Q1 120 136 154
1997Q2 180 145 154
1997Q2 186 134 126
1997Q2 186 124 176
1997Q3 190 143 176
1997Q3 192 143 123
I would like to calculate differences for each values within the same date, for example the differences in value 1 column for 1997q1, then 1997q2 and so on.
I would like these differences to be shown in a new column, so that the results would look something like this;
Date Value1 Value 2 Value 3 Diff Val 1 Diff Val 2 Diff Val 3
1997Q1 100 130 120 0 0 4
1997Q1 100 130 124 20 6 30
1997Q1 120 136 154 N/A N/A N/A
1997Q2 180 145 154 6 -11 -28
1997Q2 186 134 126 0 10 50
1997Q2 186 124 176 N/A N/A N/A
1997Q3 190 143 176 2 0 -53
1997Q3 192 143 123
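You can use dplyr functions for this. The ~ .x - lead(.x) is the function applied to every value column, selected with starts_with. We take the current value minus the next value within each Date group. If you need the next value minus the current value instead (which is the sign convention of the expected output in the question), use ~ lead(.x) - .x. If you need lag, switch it around, ~ lag(.x) - .x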
library(dplyr)

# Within each Date, subtract the following row's value from the current one
# for every column whose name starts with "Value". The result is written to
# new diff_<column> columns and stays grouped by Date.
df1 %>%
  group_by(Date) %>%
  mutate(
    across(
      .cols = starts_with("Value"),
      .fns = ~ .x - lead(.x),
      .names = "diff_{.col}"
    )
  )
if the values are numeric and the column names are not easily found, you can use mutate(across(where(is.numeric), ~.x - lead(.x), .names = "diff_{.col}")).
# A tibble: 8 × 7
# Groups: Date [3]
Date Value1 Value2 Value3 diff_Value1 diff_Value2 diff_Value3
<chr> <int> <int> <int> <int> <int> <int>
1 1997Q1 100 130 120 0 0 -4
2 1997Q1 100 130 124 -20 -6 -30
3 1997Q1 120 136 154 NA NA NA
4 1997Q2 180 145 154 -6 11 28
5 1997Q2 186 134 126 0 10 -50
6 1997Q2 186 124 176 NA NA NA
7 1997Q3 190 143 176 -2 0 53
8 1997Q3 192 143 123 NA NA NA
data:
df1 <- structure(list(Date = c("1997Q1", "1997Q1", "1997Q1", "1997Q2",
"1997Q2", "1997Q2", "1997Q3", "1997Q3"), Value1 = c(100L, 100L,
120L, 180L, 186L, 186L, 190L, 192L), Value2 = c(130L, 130L, 136L,
145L, 134L, 124L, 143L, 143L), Value3 = c(120L, 124L, 154L, 154L,
126L, 176L, 176L, 123L)), class = "data.frame", row.names = c(NA,
-8L))

Comparing between dataframes to find numbers +/- 1 of the observation in the same position in both dataframes

I have two dataframes, one at time point 1 and one at time point 2. Each dataframe has a column "midpoint", and I want to compare the midpoint in dataframe 2 (time point 2) to the midpoint in dataframe 1 (time point 1) such that, if it is within +/- 1, a unique "id" is assigned in a column called "id" for each comparison that returns TRUE under the above parameters. If it's FALSE, then the id should be blank or 0. I've been playing around with the ifelse() function with little success so far. I've been trying to create a function so that the dataframe at time point n is compared to the previous time point (n-1).
I will eventually use the purrr package to loop it for every timepoint (total of 130ish), for some context behind why im doing this.
Ignore the maximum and minimum column, these are relevant for something different, appreciate any help possible!
Dataframe 1 (time point 1)
structure(list(Object = c(2666L, 2668L, 2671L, 2674L, 2676L,
2677L, 2678L, 2679L, 2680L, 2682L, 2683L, 2684L, 2685L, 2686L,
2687L, 2689L, 2692L, 2693L, 2694L, 2695L, 2696L), minimum = c(4L,
39L, 147L, 224L, 419L, 531L, 595L, 641L, 669L, 723L, 810L, 836L,
907L, 978L, 1061L, 1129L, 1290L, 1519L, 1749L, 1843L, 1897L),
maximum = c(22L, 85L, 173L, 242L, 449L, 587L, 627L, 655L,
702L, 740L, 828L, 890L, 923L, 1024L, 1086L, 1144L, 1302L,
1544L, 1780L, 1870L, 1925L), midpoint = c(13, 62, 160, 233,
434, 559, 611, 648, 685.5, 731.5, 819, 863, 915, 1001, 1073.5,
1136.5, 1296, 1531.5, 1764.5, 1856.5, 1911)), row.names = c(NA,
-21L), class = c("tbl_df", "tbl", "data.frame"))
Dataframe 2 (time point 2)
structure(list(Object = c(2645L, 2646L, 2650L, 2652L, 2655L,
2656L, 2657L, 2658L, 2659L, 2661L, 2662L, 2663L, 2664L, 2665L,
2667L, 2670L, 2675L, 2681L, 2688L, 2690L, 2691L), minimum = c(4L,
40L, 147L, 224L, 415L, 532L, 595L, 641L, 670L, 722L, 811L, 835L,
907L, 978L, 1061L, 1128L, 1289L, 1520L, 1748L, 1843L, 1897L),
maximum = c(22L, 85L, 173L, 242L, 445L, 588L, 627L, 655L,
702L, 739L, 828L, 891L, 923L, 1022L, 1085L, 1143L, 1302L,
1544L, 1779L, 1870L, 1925L), midpoint = c(13, 62.5, 160,
233, 430, 560, 611, 648, 686, 730.5, 819.5, 863, 915, 1000,
1073, 1135.5, 1295.5, 1532, 1763.5, 1856.5, 1911)), row.names = c(NA,
-21L), class = c("tbl_df", "tbl", "data.frame"))
Expected output:
object minimum maximum midpoint id
2645 4 22 13 1
2646 40 85 62.5 2
2650 147 173 160 3
So the output is an additional column to dataframe 2, with a unique ID for each instance where midpoint in observation 1 (in df2) is within +/- 1 to observation 1 (in df1). As i want to compare to the n-1th dataframe because it represents the previous timepoint.
You can subset df2 on those rows in which df2$midpoint is within the desired range of df1$midpoint, store that subsetted dataframe as a new dataframe and add an id column to it:
# Keep only the rows of df2 whose midpoint lies within +/- 1 of the midpoint
# at the same row position in df1. Both bounds must come from df1: comparing
# df2$midpoint against df2$midpoint + 1 is always TRUE and checks nothing.
# NOTE(review): assumes df1 and df2 have the same number of rows, in matching
# order - confirm before applying this to other time points.
in_range <- df2$midpoint >= df1$midpoint - 1 &
  df2$midpoint <= df1$midpoint + 1
df2new <- df2[in_range, ]
# seq_len() also works when no row matches (1:0 would give c(1, 0)).
df2new$id <- seq_len(nrow(df2new))
df2new
# A tibble: 20 x 5
Object minimum maximum midpoint id
<int> <int> <int> <dbl> <int>
1 2645 4 22 13 1
2 2646 40 85 62.5 2
3 2650 147 173 160 3
4 2652 224 242 233 4
5 2656 532 588 560 5
6 2657 595 627 611 6
7 2658 641 655 648 7
8 2659 670 702 686 8
9 2661 722 739 730. 9
10 2662 811 828 820. 10
11 2663 835 891 863 11
12 2664 907 923 915 12
13 2665 978 1022 1000 13
14 2667 1061 1085 1073 14
15 2670 1128 1143 1136. 15
16 2675 1289 1302 1296. 16
17 2681 1520 1544 1532 17
18 2688 1748 1779 1764. 18
19 2690 1843 1870 1856. 19
20 2691 1897 1925 1911 20
Alternatively, if you wanted to keep df2 as it is but 'flag' those rows that fall into the desired range with 1 and those that don't with 0, you could do this:
# Flag each row of df2 with 1 when its midpoint is within +/- 1 of the
# midpoint at the same row position in df1, and with 0 otherwise. The upper
# bound must use df1$midpoint; comparing df2$midpoint with itself is always
# TRUE.
df2$id <- ifelse(
  df2$midpoint >= df1$midpoint - 1 & df2$midpoint <= df1$midpoint + 1,
  1,
  0
)
df2
# A tibble: 21 x 5
Object minimum maximum midpoint id
<int> <int> <int> <dbl> <dbl>
1 2645 4 22 13 1
2 2646 40 85 62.5 1
3 2650 147 173 160 1
4 2652 224 242 233 1
5 2655 415 445 430 0
6 2656 532 588 560 1
7 2657 595 627 611 1
8 2658 641 655 648 1
9 2659 670 702 686 1
10 2661 722 739 730. 1
# … with 11 more rows
... and if you wanted a consecutive run of id values for the rows inside the range while still marking the rows outside it with 0, use cumsum on id (cumsum alone would just repeat the previous id on an out-of-range row, so those rows are reset to 0 afterwards):
# Running count of the flagged rows gives consecutive ids; rows whose flag is
# below 1 are overwritten with 0 so they keep the `id` value `0`.
df2$id2 <- replace(cumsum(df2$id), df2$id < 1, 0)
to obtain this:
df2
# A tibble: 21 x 6
Object minimum maximum midpoint id id2
<int> <int> <int> <dbl> <dbl> <dbl>
1 2645 4 22 13 1 1
2 2646 40 85 62.5 1 2
3 2650 147 173 160 1 3
4 2652 224 242 233 1 4
5 2655 415 445 430 0 0
6 2656 532 588 560 1 5
7 2657 595 627 611 1 6
8 2658 641 655 648 1 7
9 2659 670 702 686 1 8
10 2661 722 739 730. 1 9
# … with 11 more rows
Since you want to check the points one-by-one, then this is probably what you are looking for,
# Row-by-row difference between the two time points.
delta <- df2$midpoint - df1$midpoint
# Both bounds have to hold at once, so they are combined with `&`; joined
# with `|` the condition is TRUE for every row and nothing is filtered.
within_one <- -1 <= delta & delta <= 1
# The running count numbers the matching rows consecutively; multiplying by
# the mask sets non-matching rows to 0 without touching the match before them.
i1 <- cumsum(within_one) * within_one
i1
#[1] 1 2 3 4 0 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20

Binding two data frames and removing same row numbers in R [duplicate]

This question already has answers here:
How to join (merge) data frames (inner, outer, left, right)
(13 answers)
Closed 2 years ago.
I have two data frames of equal number of rows (639) but differing column lengths (DF1: 5, DF2: 2500), the rows in DF1 correspond to the rows in DF2.
Some rows in DF2 will be removed due to several NAs, but I have no information on which ones are removed. cbind() does not allow me to bind the DFs together due to the now differing row lengths. However, I also need the rows to correspond, so if row 47 is removed in DF2, it must also be removed in DF1 upon merge. My assumption is that there can be some workaround with row.names but I am not sure how to execute it. Help would be appreciated. Examples of DFs below:
DF1:
pp trialNo item trialTarget trial
1 pp01 1 M012 script 1
2 pp01 2 BS016 script 2
3 pp01 3 M007 script 3
4 pp01 4 BS010 script 4
5 pp01 5 M006 script 5
6 pp01 6 BS018 script 6
DF2:
V1 V2 V3 V4 V5 V6
1: 764 764 763 763 762 763
2: 714 714 711 708 705 704
3: 872 871 869 867 867 871
4: 730 728 727 724 722 719
5: 789 786 788 790 792 790
6: 922 923 928 933 938 938
And assuming row 3 in DF2 is removed, I would expect this after binding:
pp trialNo item trialTarget trial V1 V2 V3 V4 V5 V6
1 pp01 1 M012 script 1 764 764 763 763 762 763
2 pp01 2 BS016 script 2 714 714 711 708 705 704
4 pp01 4 BS010 script 4 730 728 727 724 722 719
5 pp01 5 M006 script 5 789 786 788 790 792 790
6 pp01 6 BS018 script 6 922 923 928 933 938 938
Thanks in advance.
You could create a row index in each of the dataframe.
# Tag every row with its original position so the two data frames can still
# be matched after rows are dropped from one of them. seq_len() is used
# instead of 1:nrow() because it yields an empty index for a zero-row frame.
df1$row <- seq_len(nrow(df1))
df2$row <- seq_len(nrow(df2))
Then remove 3rd row in df2.
df2 <- df2[-3, ]
You can then merge by row column both the dataframes.
merge(df1, df2, by = 'row')
# row pp trialNo item trialTarget trial V1 V2 V3 V4 V5 V6
#1 1 pp01 1 M012 script 1 764 764 763 763 762 763
#2 2 pp01 2 BS016 script 2 714 714 711 708 705 704
#3 4 pp01 4 BS010 script 4 730 728 727 724 722 719
#4 5 pp01 5 M006 script 5 789 786 788 790 792 790
#5 6 pp01 6 BS018 script 6 922 923 928 933 938 938
data
df1 <- structure(list(pp = c("pp01", "pp01", "pp01", "pp01", "pp01",
"pp01"), trialNo = 1:6, item = c("M012", "BS016", "M007", "BS010",
"M006", "BS018"), trialTarget = c("script", "script", "script",
"script", "script", "script"), trial = 1:6, row = 1:6), row.names = c(NA,
-6L), class = "data.frame")
df2 <- structure(list(V1 = c(764L, 714L, 872L, 730L, 789L, 922L), V2 = c(764L,
714L, 871L, 728L, 786L, 923L), V3 = c(763L, 711L, 869L, 727L,
788L, 928L), V4 = c(763L, 708L, 867L, 724L, 790L, 933L), V5 = c(762L,
705L, 867L, 722L, 792L, 938L), V6 = c(763L, 704L, 871L, 719L,
790L, 938L)), class = "data.frame", row.names = c(NA, -6L))

How can I delete "a lot" of rows from a dataframe in r

I tried all the similar posts but none of the answers seemed to work for me. I want to delete 8500+ rows (by rowname only) from a dataframe with 27,000+. The other columns are completely different, but the smaller dataset was derived from the larger one, and just looking for names shows me that whatever I look for from smaller df it is present in larger df. I could of course do this manually (busy work for sure!), but seems like there should be a simple computational answer.
I have tried:
fordel<-df2[1,]
df3<-df1[!rownames(df1) %in% fordel
l1<- as.vector(df2[1,])
df3<- df1[1-c(l1),]
and lots of other crazy ideas!
Here is a smallish example: df1:
Ent_gene_id clone57_RNA clone43_RNA_2 clone67_RNA clone55_RNA
ENSMUSG00000000001.4 10634 6954 6835 6510
ENSMUSG00000000003.15 0 0 0 0
ENSMUSG00000000028.14 559 1570 807 1171
ENSMUSG00000000031.15 5748 174 4103 146
ENSMUSG00000000037.16 37 194 49 96
ENSMUSG00000000049.11 0 3 1 0
ENSMUSG00000000056.7 1157 1125 806 947
ENSMUSG00000000058.6 75 304 123 169
ENSMUSG00000000078.6 4012 4391 5637 3854
ENSMUSG00000000085.16 381 560 482 368
ENSMUSG00000000088.6 2667 4777 3483 3450
ENSMUSG00000000093.6 3 48 41 22
ENSMUSG00000000094.12 23 201 102 192
df2
structure(list(base_mean = c(7962.408875, 947.1240794, 43.76698418 ), log2foldchange = c(-0.363434063, -0.137403759, -0.236463207 ), lfcSE = c(0.096816743, 0.059823215, 0.404929452), stat = c(-3.753834854, -2.296830066, -0.583961493)), row.names = c("ENSMUSG00000000001.4", "ENSMUSG00000000056.7", "ENSMUSG00000000093.6"), class = "data.frame")
I want to delete from df1 the rows corresponding to the rownames in df2.
Tried to format it, but seems no longer formatted... oh well....
Suggestions really appreciated!
You mentioned row names but your data does not include that, so I'll assume that they really don't matter (or exist). Also, your df2 has more column headers than columns, not sure what's going on there ... so I'll ignore it.
Data
df1 <- structure(list(Ent_gene_id = c("ENSMUSG00000000001.4", "ENSMUSG00000000003.15",
"ENSMUSG00000000028.14", "ENSMUSG00000000031.15", "ENSMUSG00000000037.16",
"ENSMUSG00000000049.11", "ENSMUSG00000000056.7", "ENSMUSG00000000058.6",
"ENSMUSG00000000078.6", "ENSMUSG00000000085.16", "ENSMUSG00000000088.6",
"ENSMUSG00000000093.6", "ENSMUSG00000000094.12"), clone57_RNA = c(10634L,
0L, 559L, 5748L, 37L, 0L, 1157L, 75L, 4012L, 381L, 2667L, 3L,
23L), clone43_RNA_2 = c(6954L, 0L, 1570L, 174L, 194L, 3L, 1125L,
304L, 4391L, 560L, 4777L, 48L, 201L), clone67_RNA = c(6835L,
0L, 807L, 4103L, 49L, 1L, 806L, 123L, 5637L, 482L, 3483L, 41L,
102L), clone55_RNA = c(6510L, 0L, 1171L, 146L, 96L, 0L, 947L,
169L, 3854L, 368L, 3450L, 22L, 192L)), class = "data.frame", row.names = c(NA,
-13L))
df2 <- structure(list(Ent_gene_id = c("ENSMUSG00000000001.4", "ENSMUSG00000000056.7",
"ENSMUSG00000000093.6"), base_mean = c(7962.408875, 947.1240794,
43.76698418), log2foldchange = c(-0.36343406, -0.137403759, -0.236463207
), pvalue = c(0.00017415, 0.021628466, 0.55924622)), class = "data.frame", row.names = c(NA,
-3L))
Base
df1[!df1$Ent_gene_id %in% df2$Ent_gene_id,]
# Ent_gene_id clone57_RNA clone43_RNA_2 clone67_RNA clone55_RNA
# 2 ENSMUSG00000000003.15 0 0 0 0
# 3 ENSMUSG00000000028.14 559 1570 807 1171
# 4 ENSMUSG00000000031.15 5748 174 4103 146
# 5 ENSMUSG00000000037.16 37 194 49 96
# 6 ENSMUSG00000000049.11 0 3 1 0
# 8 ENSMUSG00000000058.6 75 304 123 169
# 9 ENSMUSG00000000078.6 4012 4391 5637 3854
# 10 ENSMUSG00000000085.16 381 560 482 368
# 11 ENSMUSG00000000088.6 2667 4777 3483 3450
# 13 ENSMUSG00000000094.12 23 201 102 192
dplyr
dplyr::anti_join(df1, df2, by = "Ent_gene_id")
# Ent_gene_id clone57_RNA clone43_RNA_2 clone67_RNA clone55_RNA
# 1 ENSMUSG00000000003.15 0 0 0 0
# 2 ENSMUSG00000000028.14 559 1570 807 1171
# 3 ENSMUSG00000000031.15 5748 174 4103 146
# 4 ENSMUSG00000000037.16 37 194 49 96
# 5 ENSMUSG00000000049.11 0 3 1 0
# 6 ENSMUSG00000000058.6 75 304 123 169
# 7 ENSMUSG00000000078.6 4012 4391 5637 3854
# 8 ENSMUSG00000000085.16 381 560 482 368
# 9 ENSMUSG00000000088.6 2667 4777 3483 3450
# 10 ENSMUSG00000000094.12 23 201 102 192
Edit: same thing but with row names:
# update my df1 to change Ent_gene_id from a column to rownames
rownames(df1) <- df1$Ent_gene_id
df1$Ent_gene_id <- NULL
# use your updated df2 (from dput)
# df2 <- structure(...)
df1[ !rownames(df1) %in% rownames(df2), ]
# clone57_RNA clone43_RNA_2 clone67_RNA clone55_RNA
# ENSMUSG00000000003.15 0 0 0 0
# ENSMUSG00000000028.14 559 1570 807 1171
# ENSMUSG00000000031.15 5748 174 4103 146
# ENSMUSG00000000037.16 37 194 49 96
# ENSMUSG00000000049.11 0 3 1 0
# ENSMUSG00000000058.6 75 304 123 169
# ENSMUSG00000000078.6 4012 4391 5637 3854
# ENSMUSG00000000085.16 381 560 482 368
# ENSMUSG00000000088.6 2667 4777 3483 3450
# ENSMUSG00000000094.12 23 201 102 192

Resources