I am comparing two data frames: FU and FO
Here are short samples of what they look like:
"Model_ID" "FU_Lin_Period" "FU_Growth_rate"
2 0.72127 0.0093333
3 0.69281 0.015857
4 0.66735 0.021103
5 0.64414 0.024205
6 0.62288 0.026568
7 0.60318 0.027749
8 0.58472 0.028161
9 0.56734 0.028008
10 0.55085 0.027309
11 0.53522 0.026068
12 0.52029 0.024684
13 0.50603 0.022866
14 0.49237 0.020991
15 0.47928 0.018773
"Model_ID" "FO_Lin_Period" "FO_Growth_rate"
7 0.44398 0.008868
8 0.43114 0.01674
9 0.41896 0.023248
10 0.40728 0.028641
11 0.39615 0.032192
12 0.38543 0.03543
13 0.37517 0.03692
14 0.36525 0.038427
15 0.35573 0.038195
As you can see, the two tables do not share all the same Model_ID values.
Basically, what I want to do is go through every Model_ID in the two tables, compare whether FU or FO's growth rate is larger for a given model ID, and...
if FU's is larger (or FU exists for the model number and FO does not), place the model number in a vector called selected_FU
if FO's is larger (or FO exists for the model number and FU does not), place the model number in a vector called selected_FO
Is there a way to do this without using loops?
A data.table alternative using logic similar to the tidyverse answer:
Replace NAs with -Inf, compare the two FU/FO_Growth_rate variables, flag which group had the larger value, and collect the Model_IDs into the requested vectors.
library(data.table)
setDT(FU)
setDT(FO)
out <- merge(FU, FO, by="Model_ID", all=TRUE)[,
"gr_sel" := c("FO","FU")[(nafill(FU_Growth_rate, fill=-Inf) >
nafill(FO_Growth_rate, fill=-Inf)) + 1],
]
selected_FU <- out[gr_sel == "FU", Model_ID]
selected_FO <- out[gr_sel == "FO", Model_ID]
Data used:
FU <- read.table(text="Model_ID FU_Lin_Period FU_Growth_rate\n2 0.72127 0.0093333\n3 0.69281 0.015857\n4 0.66735 0.021103\n5 0.64414 0.024205\n6 0.62288 0.026568\n7 0.60318 0.027749\n8 0.58472 0.028161\n9 0.56734 0.028008\n10 0.55085 0.027309\n11 0.53522 0.026068\n12 0.52029 0.024684\n13 0.50603 0.022866\n14 0.49237 0.020991\n15 0.47928 0.018773", header=TRUE)
FO <- read.table(text="Model_ID FO_Lin_Period FO_Growth_rate\n7 0.44398 0.008868\n8 0.43114 0.01674\n9 0.41896 0.023248\n10 0.40728 0.028641\n11 0.39615 0.032192\n12 0.38543 0.03543\n13 0.37517 0.03692\n14 0.36525 0.038427\n15 0.35573 0.038195", header=TRUE)
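With this sample data, the selection should come out as follows (a quick sanity check, worked out by hand from the two tables rather than from a verbatim run):
selected_FU
# [1] 2 3 4 5 6 7 8 9
selected_FO
# [1] 10 11 12 13 14 15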
With dplyr, tidyr, and readr.
library(dplyr)
library(tidyr)
library(readr)
FU <- read_table2("test.FU.LINA.table")
FO <- read_table2("test.FO.LINA.table")
df_compared <-
full_join(FU, FO, by = "model_id") %>%
replace_na(list(fo_growth_rate = -1, fu_growth_rate = -1)) %>%
mutate(select_fufo = if_else(fu_growth_rate >= fo_growth_rate, true = "fu", false = "fo"))
df_compared
# A tibble: 6,166 x 6
model_id fu_lin_period fu_growth_rate fo_lin_period fo_growth_rate select_fufo
<dbl> <dbl> <dbl> <dbl> <dbl> <chr>
1 2 0.721 0.00933 NA -1 fu
2 3 0.693 0.0159 NA -1 fu
3 4 0.667 0.0211 NA -1 fu
4 5 0.644 0.0242 NA -1 fu
5 6 0.623 0.0266 NA -1 fu
6 7 0.603 0.0277 0.444 0.00887 fu
7 8 0.585 0.0282 0.431 0.0167 fu
8 9 0.567 0.0280 0.419 0.0232 fu
9 10 0.551 0.0273 0.407 0.0286 fo
10 11 0.535 0.0261 0.396 0.0322 fo
# ... with 6,156 more rows
selected_fu <- df_compared %>% filter(select_fufo == "fu") %>% .$model_id
selected_fo <- df_compared %>% filter(select_fufo == "fo") %>% .$model_id
I have a data frame like this:
require(dplyr)
x_1=rnorm(10,0,1)
x_2=rnorm(10,0,1)
x_3=rnorm(10,0,1)
y_1=rnorm(10,0,1)
y_2=rnorm(10,0,1)
data=data.frame(cbind(x_1,x_2,x_3,y_1,y_2))
data[1,1]=NA
data[2,1]=NA
data[5,2]=NA
> data
x_1 x_2 x_3 y_1 y_2
1 NA 0.9272000 0.29439845 -1.7856567 1.6579091
2 NA 0.2346621 1.09837343 0.3731092 0.6111779
3 0.7315300 -0.5579094 -0.08524311 -2.8661310 1.1545358
4 -0.9469221 0.6929277 -2.67173898 0.6391045 -0.5114099
5 1.5408777 NA 1.33386146 -0.5581233 -2.5733381
6 -0.2852210 -0.9532492 0.03750860 -1.0129503 0.3929722
7 -1.3821487 -2.1865094 -0.03039062 0.3960388 -1.5332137
8 -0.9447420 0.2669902 0.65167163 0.4310705 -1.5300816
9 -0.9023479 0.2068130 0.10868635 -1.1652238 -0.4892178
10 -0.9739177 -0.8094084 0.64103491 0.6063812 0.7248394
I need to create a new variable which counts the number of non missing values in each row for the variables starting with "x_". To do that I used mutate and across functions from dplyr.
data=data %>% mutate(sum_no_miss=across(.cols = starts_with("x_"),~ sum(is.na(.x))))
I ran the code without getting an error, but I am not getting the output that I want.
Would it be possible to tell what I'm doing wrong?
We may use rowSums, which is vectorized and more efficient than rowwise with sum:
library(dplyr)
data %>%
mutate(sum_no_miss = rowSums(!is.na(across(starts_with("x_")))))
-output
x_1 x_2 x_3 y_1 y_2 sum_no_miss
1 NA 0.9272000 0.29439845 -1.7856567 1.6579091 2
2 NA 0.2346621 1.09837343 0.3731092 0.6111779 2
3 0.7315300 -0.5579094 -0.08524311 -2.8661310 1.1545358 3
4 -0.9469221 0.6929277 -2.67173898 0.6391045 -0.5114099 3
5 1.5408777 NA 1.33386146 -0.5581233 -2.5733381 2
6 -0.2852210 -0.9532492 0.03750860 -1.0129503 0.3929722 3
7 -1.3821487 -2.1865094 -0.03039062 0.3960388 -1.5332137 3
8 -0.9447420 0.2669902 0.65167163 0.4310705 -1.5300816 3
9 -0.9023479 0.2068130 0.10868635 -1.1652238 -0.4892178 3
10 -0.9739177 -0.8094084 0.64103491 0.6063812 0.7248394 3
If we want to use sum, then we need rowwise:
data %>%
rowwise %>%
mutate(sum_no_miss = sum(!is.na(c_across(starts_with('x_'))))) %>%
ungroup
-output
# A tibble: 10 × 6
x_1 x_2 x_3 y_1 y_2 sum_no_miss
<dbl> <dbl> <dbl> <dbl> <dbl> <int>
1 NA 0.927 0.294 -1.79 1.66 2
2 NA 0.235 1.10 0.373 0.611 2
3 0.732 -0.558 -0.0852 -2.87 1.15 3
4 -0.947 0.693 -2.67 0.639 -0.511 3
5 1.54 NA 1.33 -0.558 -2.57 2
6 -0.285 -0.953 0.0375 -1.01 0.393 3
7 -1.38 -2.19 -0.0304 0.396 -1.53 3
8 -0.945 0.267 0.652 0.431 -1.53 3
9 -0.902 0.207 0.109 -1.17 -0.489 3
10 -0.974 -0.809 0.641 0.606 0.725 3
In the OP's code, sum is used inside across, and across loops over each column; the result is therefore a per-column count (here, the number of NA values in each column) instead of a count across each row.
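To make that visible, here is a small sketch of what the original call produces (output shown is what it should look like given the sample data, not a verbatim run): sum_no_miss becomes a packed data-frame column holding one NA count per x_ column, recycled down every row.
wrong <- data %>%
  mutate(sum_no_miss = across(starts_with("x_"), ~ sum(is.na(.x))))
wrong$sum_no_miss[1, ]   # sketch; expected output below
#   x_1 x_2 x_3
# 1   2   1   0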
First, you're close.
Second, your variable is named in a misleading way (Edit: I believe the OP has since fixed this): you should rename sum_is_miss to something like sum_no_miss if you want readers to understand that you are calculating the number of non-missing values.
Third, here is how you could calculate the number of non-missing entries per row: first with apply, then adding the resulting column to your dataset:
library(tidyverse)
x_1=rnorm(10,0,1)
x_2=rnorm(10,0,1)
x_3=rnorm(10,0,1)
y_1=rnorm(10,0,1)
y_2=rnorm(10,0,1)
data=data.frame(cbind(x_1,x_2,x_3,y_1,y_2))
data[1,1]=NA
data[2,1]=NA
data[5,2]=NA
sum_no_miss_vec <- apply(data %>% dplyr::select(starts_with("x_")), MARGIN = 1, FUN = function(r){
sum(!is.na(r))
})
data2 <- data %>% mutate(sum_no_miss = sum_no_miss_vec)
data2
#> x_1 x_2 x_3 y_1 y_2 sum_no_miss
#> 1 NA 0.74234418 1.06515091 -0.313359946 -0.81266805 2
#> 2 NA 2.13222122 0.78392737 2.109171065 0.69459821 2
#> 3 0.9322299 -0.52545325 0.67377319 2.025281430 0.99975832 3
#> 4 0.9634517 0.38985353 1.20940016 -0.007232240 -1.61104902 3
#> 5 0.4454230 NA 0.02420848 -1.743636503 -0.59597234 2
#> 6 -1.7305822 2.07163152 -0.52849895 0.830802138 -1.40573549 3
#> 7 0.2382603 0.20427098 0.22184048 0.806113977 -0.36726054 3
#> 8 -0.3972436 1.61183785 -0.26835072 0.419459671 -0.05723072 3
#> 9 0.3703195 -0.05354607 -1.19558014 -0.852003930 0.64032424 3
#> 10 0.3003434 -0.82513981 0.19782771 0.001526784 0.89393655 3
Created on 2022-04-15 by the reprex package (v2.0.1)
Here is a base R solution. As akrun already provided the best answer, this is just a different approach:
We use apply with grep to apply the function to the specific columns:
data$sum_no_miss <- apply(data[, grep("x_", names(data))], 1, function(x) sum(!is.na(x)))
x_1 x_2 x_3 y_1 y_2 sum_no_miss
1 NA -0.5659449 -1.44792814 0.1659370 0.8040186 2
2 NA 2.6873635 -0.70704189 1.2647756 -0.1238085 2
3 -0.3239291 0.6206436 0.04374401 -0.6476829 1.5228775 3
4 0.7245148 1.6632621 -0.39304104 -0.9305281 1.1328385 3
5 -0.5994830 NA 0.06037891 -1.7654617 0.3073035 2
6 -0.1848746 0.3694963 -1.13622715 0.9252195 0.1072250 3
7 -0.1147132 0.4042102 1.56730477 0.3262673 -0.6369951 3
8 -0.8631230 0.2888508 -2.20030009 -0.9873629 0.2561348 3
9 -0.9384460 -0.8739620 -1.59174131 0.7559146 -1.4229472 3
10 -0.9352575 1.3151532 -0.11439843 -0.5451860 0.9334084 3
Here is a possible data.table solution, using Reduce and lapply on the specific columns selected with grep.
library(data.table)
dt <- as.data.table(data)
dt[, sum_no_miss := Reduce(`+`, lapply(.SD, \(x) !is.na(x))), .SDcols=grep("x_", names(dt))]
Output
x_1 x_2 x_3 y_1 y_2 sum_no_miss
1: NA 1.30228879 -0.586898083 -1.02679871 -0.9280488 2
2: NA -1.00846632 -0.260183508 -0.78828113 -0.8712957 2
3: -0.40475601 0.22961832 0.004414558 -1.04496673 -0.1032775 3
4: 0.09559518 -0.58875533 1.360528488 -0.48798151 -0.6350380 3
5: -0.39312997 NA 0.292025300 1.13544025 -0.2487097 2
6: -1.15802973 1.01589098 0.445829196 -0.02029337 0.9758154 3
7: -0.02524740 -0.17334510 -1.455821490 -0.12165396 -0.4441740 3
8: 0.93627901 -0.92913166 0.407038460 2.04054914 -0.8347571 3
9: 1.20218530 0.54453181 0.513222262 0.05571475 -0.4858128 3
10: 0.84765702 0.07472934 1.367745731 -1.49924113 -1.3170490 3
I am trying to write a conditional statement with the following constraints. Below is an example data frame showing the problem I am running into.
Row <- c(1,2,3,4,5,6,7)
La <- c(51.25,51.25,51.75,53.25,53.25,54.25,54.25)
Lo <- c(128.25,127.75,127.25,119.75,119.25,118.75,118.25)
Y <- c(5,10,2,4,5,7,9)
Cl <- c("EF","EF","EF","EF","NA","NA","CE")
d <- data.frame(Row,La,Lo,Y,Cl)
Row La Lo Y Cl
1 1 51.25 128.25 5 EF
2 2 51.25 127.75 10 EF
3 3 51.75 127.25 2 EF
4 4 53.25 119.75 4 EF
5 5 53.25 119.25 5 NA
6 6 54.25 118.75 7 NA
7 7 55.25 118.25 9 CE
Where "Cl" is NA, I would like to add that row's "Y" value to the row whose "La" and "Lo" values are closest (within 1.00), and then drop the NA row. In effect, I want to remove the NA rows from the data frame without losing their "Y" values; each value is instead added to its closest neighbor.
I would like the return data frame to look like this:
Row2 <- c(1,2,3,4,7)
La2 <- c(51.25,51.25,51.75,53.25,55.25)
Lo2 <- c(128.25,127.75,127.25,119.75,118.25)
Y2 <- c(5,10,2,9,16)
Cl2 <- c("EF","EF","EF","EF","CE")
d2 <- data.frame(Row2,La2,Lo2,Y2,Cl2)
Row2 La2 Lo2 Y2 Cl2
1 1 51.25 128.25 5 EF
2 2 51.25 127.75 10 EF
3 3 51.75 127.25 2 EF
4 4 53.25 119.75 9 EF
5 7 55.25 118.25 16 CE
Recent edit: if an NA row is close to one row in terms of its Lo value and equally close to another row in terms of its La value, join by the La value. If two rows are equally close in both Lo and La, join by the smaller La value.
Thank you for the help!
Here is a method to use if you can make some distance matrix m for the distance between all the (La, Lo) rows in your data. I use the output of dist, which is euclidean distance. The row with the lowest distance is selected, or the earliest such row if the lowest distance is shared by > 1 row.
w <- which(is.na(d$Cl))                 # rows with missing Cl (assumes real NA values, not the string "NA")
m <- as.matrix(dist(d[c('La', 'Lo')]))  # pairwise euclidean distances between the (La, Lo) rows
m[row(m) %in% w] <- NA                  # an NA row can never be picked as a nearest neighbor
d$g <- replace(seq(nrow(d)), w, apply(m[,w], 2, which.min))  # group each NA row with its nearest non-NA row
library(dplyr)
d %>%
group_by(g) %>%
summarise(La = La[!is.na(Cl)],
Lo = Lo[!is.na(Cl)],
Y = sum(Y),
Cl = Cl[!is.na(Cl)]) %>%
select(-g)
# # A tibble: 5 x 4
# La Lo Y Cl
# <dbl> <dbl> <dbl> <fct>
# 1 51.2 128. 5 EF
# 2 51.2 128. 10 EF
# 3 51.8 127. 2 EF
# 4 53.2 120. 9 EF
# 5 54.2 118. 16 CE
I am interested in performing multiple tests for a single variable with an associated factor that splits the values into multiple groups. It is related to this question and, actually, I would like a solution of that kind, but it is not exactly the same.
In my case, I have a single variable and multiple groups (eventually many). Expanding on this example:
library(reshape)
# Create a dataset
mu=34
stdv=5
Location=rep(c("Area_A","Area_B","Area_C"),5)
distro=rnorm(length(Location),mu,stdv)
id=seq(1:length(Location))
sample_long=data.frame(id,Location,distro)
sample_long
id Location distro
1 1 Area_A 34.95737
2 2 Area_B 31.30298
3 3 Area_C 35.86569
4 4 Area_A 40.45378
5 5 Area_B 36.12060
6 6 Area_C 28.29649
7 7 Area_A 30.64495
8 8 Area_B 29.70668
9 9 Area_C 33.22874
10 10 Area_A 25.29148
11 11 Area_B 32.35511
12 12 Area_C 34.69159
13 13 Area_A 26.89791
14 14 Area_B 35.30717
15 15 Area_C 40.64628
I would like to perform all-against-all tests among Areas, i.e. test(Area_A,Area_B), test(Area_A,Area_C) and test(Area_B,Area_C) (in a more general case, all the i<j possible tests).
A simple way to go is to transform the data into wide format:
# Reshape to wide format
sample_wide=reshape(sample_long,direction="wide",idvar="id",timevar="Location")
sample_wide
id distro.Area_A distro.Area_B distro.Area_C
1 1 34.95737 NA NA
2 2 NA 31.30298 NA
3 3 NA NA 35.86569
4 4 40.45378 NA NA
5 5 NA 36.12060 NA
6 6 NA NA 28.29649
7 7 30.64495 NA NA
8 8 NA 29.70668 NA
9 9 NA NA 33.22874
10 10 25.29148 NA NA
11 11 NA 32.35511 NA
12 12 NA NA 34.69159
13 13 26.89791 NA NA
14 14 NA 35.30717 NA
15 15 NA NA 40.64628
and then loop over all pairs of columns. I have seen several approaches more R-like than the following one, which uses for loops:
# Now compute the test
test.out=list()
k=0
for(i in 2:(dim(sample_wide)[2]-1)){ # All against all var groups
  for(j in (i+1):dim(sample_wide)[2]){
    k=k+1
    test.out[[k]]=t.test(sample_wide[,i],
                         sample_wide[,j]) # store results in a list
  }
}
But my question is not about the best solution given the wide format; it is whether the problem can be solved working from the original long format, in line with the solutions in the links I provided above that use dplyr, broom, etc.
This is a little trickier and less straightforward than I hoped. You can first figure out the combinations of locations and, to make it a little simpler, save that in a lookup table. I turned that into a long shape with an ID for each pair, which I'll use as a grouping variable on the data.
library(dplyr)
library(tidyr)
library(purrr)
set.seed(111)
# same data creation code
grps <- as.data.frame(t(combn(levels(sample_long$Location), 2))) %>%
mutate(pair = row_number()) %>%
gather(key, value = loc, -pair) %>%
select(-key)
grps
#> pair loc
#> 1 1 Area_A
#> 2 2 Area_A
#> 3 3 Area_B
#> 4 1 Area_B
#> 5 2 Area_C
#> 6 3 Area_C
Joining the lookup to the data frame doubles the rows—that will differ depending on how many levels you're combining. Note also I dropped your ID column since it didn't seem necessary right now. Nest, do the t-test, and tidy the results.
sample_long %>%
select(-id) %>%
inner_join(grps, by = c("Location" = "loc")) %>%
group_by(pair) %>%
nest() %>%
mutate(t_test = map(data, ~t.test(distro ~ Location, data = .)),
tidied = map(t_test, broom::tidy)) %>%
unnest(tidied)
#> # A tibble: 3 x 13
#> pair data t_test estimate estimate1 estimate2 statistic p.value
#> <int> <lis> <list> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 1 <tib… <htes… -0.921 31.8 32.7 -0.245 0.816
#> 2 2 <tib… <htes… -1.48 31.8 33.3 -0.383 0.716
#> 3 3 <tib… <htes… -0.563 32.7 33.3 -0.305 0.769
#> # … with 5 more variables: parameter <dbl>, conf.low <dbl>,
#> # conf.high <dbl>, method <chr>, alternative <chr>
If you needed to, you could do something to show which locations are in each pair—joining with the lookup table would be one way to do this.
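For instance (a sketch, assuming the result of the pipeline above has been saved as results, which is not a name used above), the lookup table can be collapsed to one label per pair and joined back on:
# `results` is assumed to hold the output of the nest/t-test pipeline above
pair_labels <- grps %>%
  group_by(pair) %>%
  summarise(comparison = paste(loc, collapse = " vs "))
results %>%
  left_join(pair_labels, by = "pair")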
I'm realizing also that you mentioned wanting to use broom functions afterwards, but didn't specify that you need a broom::tidy call. In that case, just drop the last 2 lines.
A little bit of base R will do the trick:
combn(x=unique(sample_long$Location), m=2, simplify=FALSE,
FUN=function(l) {
t.test(distro ~ Location, data=subset(sample_long, Location %in% l))
})
combn will generate all combinations of elements of x taken m at a time (sic). Combined with subset, you will apply your test to subsets of your data.frame.
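For instance, saving the list (here as tests), you can label each comparison and pull out the p-values (a sketch):
tests <- combn(x=unique(sample_long$Location), m=2, simplify=FALSE,
               FUN=function(l) {
                 t.test(distro ~ Location, data=subset(sample_long, Location %in% l))
               })
names(tests) <- combn(unique(sample_long$Location), m=2, FUN=paste, collapse=" vs ")
sapply(tests, function(tt) tt$p.value)  # one p-value per pair of areas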
I have a data frame called "myseis" which contains acceleration data:
X Y Z dt
1 -0.843 3.854 8.247 0
2 -0.598 3.795 8.110 20
3 -0.402 3.834 8.325 19
4 -0.353 3.883 8.414 20
5 -0.117 4.059 8.404 20
6 0.039 3.854 8.159 20
7 0.235 3.726 7.894 20
8 0.372 3.706 7.717 20
9 0.451 3.903 7.835 21
10 0.568 4.197 8.061 19
I want to add a 5th column which contains the absolute time. Something like:
X Y Z dt Date/Time
1 -0.843 3.854 8.247 0 2018-08-20 10:00:00,000
2 -0.598 3.795 8.110 20 2018-08-20 10:00:00,020
3 -0.402 3.834 8.325 19 2018-08-20 10:00:00,039
4 -0.353 3.883 8.414 20 2018-08-20 10:00:00,059
etc.
So I want to add the running total of the "dt" column (milliseconds) to the start time, which is in POSIXct.
I kind of figured out that I could do it with:
time <- c(starttime)
i <- 1
while(i < nrow(myseis))
{
  i <- i+1
  time <- c(time, time[i-1] + myseis[i,4])
}
myseis <- data.frame(myseis,time)
On a small scale this checks out, but here nrow(myseis) = 247118, so this would take forever.
Is there another way? I want to plot a subset of this data frame later.
Thanks
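One vectorized alternative (a sketch, assuming starttime is a single POSIXct value and dt holds millisecond offsets, as in the question): the cumulative sum of dt is the elapsed time since the start, so the whole column can be built in one step.
# elapsed milliseconds since the start, converted to seconds, added to the POSIXct start time
myseis$time <- starttime + cumsum(myseis$dt) / 1000
# options(digits.secs = 3)  # optional: show fractional seconds when printing
Because the first dt is 0, the first row keeps the start time itself, matching the loop above.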
I am trying to order data by the column weightFisher. However, it is almost as if R does not treat the values in e (scientific) notation as small, because all of them are skipped when I try to order from smallest to largest.
Code:
resultTable_bon <- GenTable(GOdata_bon,
weightFisher = resultFisher_bon,
weightKS = resultKS_bon,
topNodes = 15136,
ranksOf = 'weightFisher'
)
head(resultTable_bon)
#create Fisher ordered df
indF <- order(resultTable_bon$weightFisher)
resultTable_bonF <- resultTable_bon[indF, ]
what resultTable_bon looks like:
GO.ID Term Annotated Significant Expected Rank in weightFisher
1 GO:0019373 epoxygenase P450 pathway 19 13 1.12 1
2 GO:0097267 omega-hydroxylase P450 pathway 9 7 0.53 2
3 GO:0042738 exogenous drug catabolic process 10 7 0.59 3
weightFisher weightKS
1 1.9e-12 0.79744
2 7.9e-08 0.96752
3 2.5e-07 0.96336
what "ordered" resultTable_bonF looks like:
GO.ID Term Annotated Significant Expected Rank in weightFisher
17 GO:0014075 response to amine 33 7 1.95 17
18 GO:0034372 very-low-density lipoprotein particle re... 11 5 0.65 18
19 GO:0060710 chorio-allantoic fusion 6 4 0.35 19
weightFisher weightKS
17 0.00014 0.96387
18 0.00016 0.83624
19 0.00016 0.92286
As @bhas says, it appears to be working precisely as you want it to. Maybe it's the use of head() that's confusing you?
To put your mind at ease, try it with something simpler
dtf <- data.frame(a=c(1, 8, 6, 2)^-10, b=c(7, 2, 1, 6))
dtf
# a b
# 1 1.000000e+00 7
# 2 9.313226e-10 2
# 3 1.653817e-08 1
# 4 9.765625e-04 6
dtf[order(dtf$a), ]
# a b
# 2 9.313226e-10 2
# 3 1.653817e-08 1
# 4 9.765625e-04 6
# 1 1.000000e+00 7
If weightFisher has been stored as character, order() sorts it lexicographically rather than numerically, which would explain why the e-notation values appear out of place. Try the following:
resultTable_bon$weightFisher <- as.numeric(resultTable_bon$weightFisher)
Then:
resultTable_bonF <- resultTable_bon[order(resultTable_bon$weightFisher), ]