I have two data frames, shown below:
# Example inputs: df1 holds the query values, df2 the candidate values.
# (The original df2 line was missing the closing ")" of data.frame().)
df1 <- data.frame(df1_col = c(0.00002, 0.00010, 0.00020, 0.00100, 0.00200))
df2 <- data.frame(
  df2_col1 = c(0.00001702406727, 0.00002002614159, 0.00002018336933,
               0.0000977206871, 0.00010785618371, 0.00018966630497,
               0.00020173904639, 0.00099759142132, 0.00104912583361,
               0.00194016482197, 0.00200732582737)
)
Now I want to compare each row of df1 with the rows of df2 and find the row number (index) of the first df2 value greater than the df1 value; using that index I will then take the corresponding data from df2.
Note: df1 has fewer rows than df2.
I tried looping through each row of df1 and df2 and then used the which() function, but it didn't work. Is there another, simpler method?
my desired output will be :-
df3 = data.frame(df1_col1 = c(0.00002,0.00010,0.00020,0.00100,0.00200), df2_col2=c(0.00002002614159,0.00010785618371,0.00020173904639,0.00104912583361,0.00200732582737))
Sorry for any inconvenience — this is my first question. Thank you in advance!
NEW DATA
# df1: the eleven query values; for each we want the first df2 value
# that is strictly greater.
df1 <- data.frame(
  df1_col1 = c(
    0.00002, 0.0001, 0.0002, 0.001, 0.002,
    0.0025, 0.004, 0.005, 0.01, 0.02, 0.03
  )
)
# df2: the candidate values, one per line, listed in ascending order
# (the "first greater" lookup below relies on df2 being sorted).
df2=data.frame(df2_col1=c(0.00014940969624,
0.00015836812803,
0.00016803247695,
0.00017844541097,
0.00018966630497,
0.00020173904639,
0.00021473722873,
0.00022871767705,
0.00024376616425,
0.00025995387854,
0.00027740350512,
0.00029622853518,
0.00031659411339,
0.00033867002512,
0.00036270484799,
0.00038895378722,
0.00041774363175,
0.00044938435908,
0.00048425557455,
0.00052270555240,
0.00056516734709,
0.00058803959079,
0.00061213685774,
0.00063753303271,
0.00066436020792,
0.00069271759254,
0.00072278590327,
0.00075471258665,
0.00078871891980,
0.00082495135506,
0.00086370796092,
0.00090520244315,
0.00094974375995,
0.00099759142132,
0.00104912583361,
0.00110468513550,
0.00116471796237,
0.00122961947450,
0.00129997381226,
0.00137633261223,
0.00145941148983,
0.00150367150729,
0.00154993475674,
0.00164886456582,
0.00170177926583,
0.00175721404389,
0.00181530011799,
0.00187625634148,
0.00194016482197,
0.00200732582737,
0.00207790286653,
0.00215216527238,
0.00223029743932,
0.00231263720905,
0.00239942910043,
0.00249104495897,
0.00258772950799,
0.00268995918708,
0.00279808620674,
0.00291260618091,
0.00303393957317,
0.00316270550389,
0.00329945165204,
0.00344491415323,
0.00359970294452,
0.00376478162054,
0.00394100416242,
0.00412946943549,
0.00433124287660,
0.00454769286700,
0.00466181286329,
0.00478015613269,
0.00490287219562,
0.00503023860438,
0.00516231516987,
0.00529948405762,
0.00544190977016,
0.00558989424000,
0.00574363032708,
0.00590348262538,
0.00606970711776,
0.00624269632708,
0.00642267044700,
0.00661012222834,
0.00680537865824,
0.00700890517615,
0.00722105704960,
0.00744238908118,
0.00767333589656,
0.00791450623655,
0.00816629638134,
0.00842946839953,
0.00870457800002,
0.00899236514987,
0.00929347419740,
0.00960877976327,
0.00993906664874,
0.01028531629401,
0.01064836560114,
0.01102939995868,
0.01142946986211,
0.01184984756038,
0.01229174638031,
0.01275666542031,
0.01324605799479,
0.01376164646858,
0.01430505034994,
0.01487834806441,
0.01517680797833,
0.01548355354501,
0.01579883472796,
0.01612301677381,
0.01645629840316,
0.01679913158769,
0.01715180723470,
0.01751473695387,
0.01788823000264,
0.01827276128134,
0.01866870741769,
0.01907658694645,
0.01949670526428,
0.01992970835114,
0.02037603813266,
0.02083628226895,
0.02131092986923,
0.02180065884019,
0.02230604531786,
0.02282781673939,
0.02336654192662,
0.02392307856762,
0.02449812744245,
0.02509254651992,
0.02570710318883,
0.02634276037547,
0.02700040459895,
0.02768108720588,
0.02838568468843,
0.02911543211252,
0.02987138853965,
0.03065480656468,
0.03146687637367,
0.03230902465341,
0.03318262955971,
0.03408928114548,
0.03503049532094,
0.03600813039164,
0.03702399530373,
0.03808014377487,
0.03917864953982,
0.04032188867370,
0.04151227866069,
0.04275252873894,
0.04404534794164,
0.04539391077701,
0.04609007134882,
0.04680142876348))
First sort df2
# Sort df2 first so "first value greater than x" is well defined.
df2 <- df2[order(df2$df2_col1), , drop = FALSE]
# For each df1 value, findInterval() counts how many df2 values are <= it,
# so +1 is the index of the first strictly greater df2 value.
# (The original apply(df1, 1, function(x) which.max(df2$df2_col1 > x))
# gives the same indices here, but apply() coerces the data frame to a
# matrix, and which.max() silently returns 1 when NO df2 value exceeds x;
# findInterval() yields an out-of-range index / NA instead, which is honest.)
# df1[[1]] is used so this works whether the column is df1_col or df1_col1.
tmp <- findInterval(df1[[1]], df2$df2_col1) + 1
tmp
# [1] 2 5 7 9 11        (with the original 5-row df1 / 11-row df2)
# Using these indices in df2 you get:
df2$df2_col1[tmp]
# [1] 2.002614e-05 1.078562e-04 2.017390e-04 1.049126e-03 2.007326e-03
Here's one way but I bet there are more concise/elegant alternatives.
library(dplyr)
# Cross every df1 row with every df2 row (tidyr::crossing() builds the
# Cartesian product), keep only the pairs whose df2 value exceeds the df1
# value, then take the smallest such df2 value per df1 value -- i.e. the
# first greater element.
# NOTE(review): slice_min() keeps ties by default (with_ties = TRUE), so
# exact duplicates in df2_col1 would produce extra rows -- confirm that is
# acceptable for the real data.
tidyr::crossing(df1, df2) %>%
filter(df1_col < df2_col1) %>%
group_by(df1_col) %>%
slice_min(df2_col1) %>%
ungroup()
I am trying to work on a for loop to make running a function I've developed more efficient.
However, when I put it in a for loop, it is overwriting columns that it should not be and returning incorrect results.
Edit: The error is that in the resulting data frame MiSeq_Bord_Outliers_table0, the columns labeled Outlier_type contain incorrect outputs.
As per the Outlier_Hunter function, when Avg_Trim_Cov and S2_Total_Read_Pairs_Processed are below their
respective Q1 Thresholds their respective Outlier_type columns should read "Lower_Outlier", if between Q1 & Q3 Threshold, "Normal" and if above Q3 Threshold then "Upper_outlier". But when the for loop is executed, only "Upper_outlier" is shown in the Outlier_type columns.
Edit: The inputs have been simplified and tested on the different computer with a clean console. If there were any artifacts there before, they should have been eliminated now, and there should be no errors here now. It is important to run the outlier_results_1var part first. If you test run this code and get errors, please let me know which part failed.
Edit: MiSeq_Bord_Outliers_table0_error is the error that is being reproduced. This is the error result, not an input.
Can someone please tell me why it is returning these incorrect results and what I can do to fix it? I will post the relevant code below. Or is there another way to do this without a for loop?
#libraries used
library(tidyverse)
library(datapasta)
library(data.table)
library(janitor)
library(ggpubr)
library(labeling)
#2.) Outlier_Hunter Function
#Function to Generate the Outlier table
#Outlier Hunter function takes 4 arguments: the dataset, column/variable of interest,
#Q1 and Q3. Q1 and Q3 are stored in the results of Quartile_Hunter.
#Input ex: MiSeq_Bord_final_report0, Avg_Trim_Cov, MiSeq_Bord_Quartiles_ATC$First_Quartile[1], MiSeq_Bord_Quartiles_ATC$Third_Quartile[1]
#Usage ex: Outlier_Hunter(MiSeq_Bord_final_report0, Avg_Trim_Cov,
#MiSeq_Bord_Quartiles_ATC$First_Quartile[1], MiSeq_Bord_Quartiles_ATC$Third_Quartile[1])
#Here is the Function to get the Outlier Table
#' Flag outliers in one numeric column of a final-report data frame.
#'
#' @param Platform_Genus_final_report0 Data frame with at least the columns
#'   ReadID, Platform, Genus and the target column.
#' @param my_col Target column, given EITHER as a bare name
#'   (Outlier_Hunter(df, Avg_Trim_Cov, q1, q3)) OR as a length-1 character
#'   string held in a variable, as happens inside the for loop below
#'   (y <- "Avg_Trim_Cov"; Outlier_Hunter(df, y, q1, q3)).
#' @param Q1,Q3 Numeric first/third-quartile thresholds for that column.
#' @return The selected columns plus Q1_Threshold, Q3_Threshold and
#'   Outlier_type ("Lower_Outlier" / "Normal" / "Upper_Outlier").
#'
#' BUG FIXED: the original used enquo()/!! only. In the loop, enquo()
#' captured the *symbol* y, so !!varname evaluated to the string
#' "Avg_Trim_Cov"; comparing a string with the numeric thresholds coerced
#' the numbers to character, and the lexical comparison made every row
#' "Upper_Outlier". We now resolve the column to a character name first.
Outlier_Hunter <- function(Platform_Genus_final_report0, my_col, Q1, Q3) {
  col_quo <- rlang::enquo(my_col)
  # If my_col evaluates to a length-1 string (loop case), use that value;
  # otherwise (bare column name -> evaluation fails or is not a string)
  # fall back to the captured expression's name.
  # NOTE(review): a bare name that shadows a character variable in the
  # caller's environment would be treated as a string -- avoid such names.
  col_name <- tryCatch(
    {
      val <- rlang::eval_tidy(col_quo)
      if (is.character(val) && length(val) == 1L) val
      else rlang::as_name(col_quo)
    },
    error = function(e) rlang::as_name(col_quo)
  )
  Platform_Genus_final_report0 %>%
    select(ReadID, Platform, Genus, all_of(col_name)) %>%
    # Classify each row against the quartile thresholds; .data[[col_name]]
    # always refers to the data-frame column, never a caller variable.
    mutate(
      Q1_Threshold = Q1,
      Q3_Threshold = Q3,
      Outlier_type = case_when(
        .data[[col_name]] < Q1_Threshold ~ "Lower_Outlier",
        .data[[col_name]] >= Q1_Threshold & .data[[col_name]] <= Q3_Threshold ~ "Normal",
        .data[[col_name]] > Q3_Threshold ~ "Upper_Outlier"
      )
    )
}
#MiSeq_Bord_Quartiles entries
#MiSeq_Bord_Quartiles entries
# One row per screened variable (row names match the report columns);
# First_Quartile/Third_Quartile are the thresholds fed to Outlier_Hunter.
MiSeq_Bord_Quartiles <- data.frame(
stringsAsFactors = FALSE,
row.names = c("Avg_Trim_Cov", "S2_Total_Read_Pairs_Processed"),
Platform = c("MiSeq", "MiSeq"),
Genus = c("Bord", "Bord"),
Min = c(0.03, 295),
First_Quartile = c(80.08, 687613.25),
Median = c(97.085, 818806.5),
Third_Quartile = c(121.5625, 988173.75),
Max = c(327.76, 2836438)
)
#Remove the hashtag below to test if what you have is correct
#datapasta::df_paste(head(MiSeq_Bord_Quartiles, 5))
#dataset entry
#dataset entry
# Five-sample example report; Avg_Trim_Cov and S2_Total_Read_Pairs_Processed
# are the columns screened for outliers below.
MiSeq_Bord_final_report0 <- data.frame(
stringsAsFactors = FALSE,
ReadID = c("A005_20160223_S11_L001","A050_20210122_S6_L001",
"A073_20210122_S7_L001",
"A076_20210426_S11_L001",
"A080_20210426_S12_L001"),
Platform = c("MiSeq","MiSeq",
"MiSeq","MiSeq","MiSeq"),
Genus = c("Bordetella",
"Bordetella","Bordetella",
"Bordetella","Bordetella"),
Avg_Raw_Read_bp = c(232.85,241.09,
248.54,246.99,248.35),
Avg_Trimmed_Read_bp = c(204.32,232.6,
238.56,242.54,244.91),
Avg_Trim_Cov = c(72.04,101.05,
92.81,41.77,54.83),
Genome_Size_Mb = c(4.1, 4.1, 4.1, 4.1, 4.1),
S1_Input_reads = c(1450010L,
1786206L,1601542L,710792L,925462L),
S1_Contaminant_reads = c(12220L,6974L,
7606L,1076L,1782L),
S1_Total_reads_removed = c(12220L,6974L,
7606L,1076L,1782L),
S1_Result_reads = c(1437790L,
1779232L,1593936L,709716L,923680L),
S2_Read_Pairs_Written = c(712776L,882301L,
790675L,352508L,459215L),
S2_Total_Read_Pairs_Processed = c(718895L,889616L,
796968L,354858L,461840L)
)
# Print the report for a quick visual check.
MiSeq_Bord_final_report0
#Execution for 1 variable (bare column name)
outlier_results_1var <- Outlier_Hunter(MiSeq_Bord_final_report0, Avg_Trim_Cov,
MiSeq_Bord_Quartiles$First_Quartile[1], MiSeq_Bord_Quartiles$Third_Quartile[1])
#Now do it with a for loop over the variable names
# NOTE(review): the all-"Upper_Outlier" symptom originates inside
# Outlier_Hunter(), not in this loop -- enquo() captures the loop symbol
# `y`, so the comparisons run on the *string* "Avg_Trim_Cov" /
# "S2_Total_Read_Pairs_Processed" instead of the column values. Fix the
# function (e.g. resolve the column via .data[[...]]) for correct results.
col_var_outliers <- row.names(MiSeq_Bord_Quartiles)
# (equivalently: c("Avg_Trim_Cov", "S2_Total_Read_Pairs_Processed");
# change the line above to change which variables are fed to Outlier_Hunter)
outlier_list_MiSeq_Bord <- list()
for (y in col_var_outliers) {
  res <- Outlier_Hunter(
    MiSeq_Bord_final_report0, y,
    MiSeq_Bord_Quartiles[y, "First_Quartile"],
    MiSeq_Bord_Quartiles[y, "Third_Quartile"]
  )
  # Prefix the per-variable columns (thresholds + Outlier_type, cols 5:7)
  # so they don't collide in the join below. The original call passed
  # `sep = ""` to paste0(), but paste0() has no sep argument -- it was
  # silently pasted on as an extra empty string (harmless, now removed).
  colnames(res)[5:7] <- paste0(y, "_", colnames(res)[5:7])
  outlier_list_MiSeq_Bord[[y]] <- res
}
# Join the per-variable tables back together on the shared key columns.
MiSeq_Bord_Outliers_table0 <- reduce(outlier_list_MiSeq_Bord, left_join, by = c("ReadID", "Platform", "Genus"))
#the columns containing label Outlier_type is where the code goes wrong.
#When Avg_Trim_Cov and S2_Total_Read_Pairs_Processed are below their
#respective Q1 Thresholds their respective Outlier_type columns should read
#"Lower_Outlier", if between Q1 & Q3 Threshold, "Normal" and if above Q3
#Threshold then "Upper_outlier". But when the for loop is executed, only
#"Upper_outlier" is shown in the Outlier_type columns.
datapasta::df_paste(head(MiSeq_Bord_Outliers_table0, 5))
# The INCORRECT output being reproduced (not an input): every
# *_Outlier_type is "Upper_Outlier" even though, e.g., Avg_Trim_Cov 72.04
# is below its Q1 threshold 80.08 and should read "Lower_Outlier".
MiSeq_Bord_Outliers_table0_error <- data.frame(
stringsAsFactors = FALSE,
ReadID = c("A005_20160223_S11_L001",
"A050_20210122_S6_L001",
"A073_20210122_S7_L001","A076_20210426_S11_L001",
"A080_20210426_S12_L001"),
Platform = c("MiSeq",
"MiSeq","MiSeq","MiSeq",
"MiSeq"),
Genus = c("Bordetella","Bordetella","Bordetella",
"Bordetella","Bordetella"),
Avg_Trim_Cov = c(72.04,
101.05,92.81,41.77,54.83),
Avg_Trim_Cov_Q1_Threshold = c(80.08,
80.08,80.08,80.08,80.08),
Avg_Trim_Cov_Q3_Threshold = c(121.5625,
121.5625,121.5625,121.5625,
121.5625),
Avg_Trim_Cov_Outlier_type = c("Upper_Outlier","Upper_Outlier",
"Upper_Outlier","Upper_Outlier",
"Upper_Outlier"),
S2_Total_Read_Pairs_Processed = c(718895L,
889616L,796968L,354858L,
461840L),
S2_Total_Read_Pairs_Processed_Q1_Threshold = c(687613.25,
687613.25,687613.25,
687613.25,687613.25),
S2_Total_Read_Pairs_Processed_Q3_Threshold = c(988173.75,
988173.75,988173.75,
988173.75,988173.75),
S2_Total_Read_Pairs_Processed_Outlier_type = c("Upper_Outlier","Upper_Outlier",
"Upper_Outlier","Upper_Outlier",
"Upper_Outlier")
)
For use in a loop like you do, it would be more useful to write your Outlier_Hunter() function to take the target column as a character string rather than an expression.
To do that, try replacing all instances of !!varname in your function with .data[[my_col]], and remove the enquo() line altogether.
Note that with these changes, you also need to change how you call the function when you don't have the column name in a variable. For example, your single execution would become:
Outlier_Hunter(
MiSeq_Bord_final_report0,
"Avg_Trim_Cov",
MiSeq_Bord_Quartiles$First_Quartile[1],
MiSeq_Bord_Quartiles$Third_Quartile[1]
)
For more info about programming with tidy evaluation functions, you may find this rlang vignette useful.
I have 24 data sets of 93 observations each. There are only two variables, a factor (size) and its response (percent). The factor value ranges from 0-2000. I would like to combine these observations into three groups based on factor values (0-2, 2-50, and 50-2000) and see the total combined response value for each. I have tried using the group_by and summarize functions, but I am fairly new with R and I am in over my head.
In addition, is it possible to automate this so that one string of code can do this for all 24 of my data sets? They are saved as different text files in the same folder. I don't know the limitations of r, so this might not be possible. If necessary, one code that I could run 24 times would still get the job done.
Here's an example of one of the data sets -
>dput(head(data))
structure(list(run.size.percent = structure(c(2L, 13L, 24L, 35L,
46L, 57L), .Label = c(",2000,", "1,0.375,0.013", "10,0.868,0.11",
"11,0.953,0.12", "12,1.047,0.12", "13,1.149,0.13", "14,1.261,0.14",
"15,1.385,0.14", "16,1.520,0.15", "17,1.668,0.15", "18,1.832,0.16",
"19,2.011,0.17", "2,0.412,0.023", "20,2.207,0.17", "21,2.423,0.18",
"22,2.660,0.19", "23,2.920,0.20", "24,3.205,0.21", "25,3.519,0.22",
"26,3.863,0.24", "27,4.240,0.25", "28,4.655,0.26", "29,5.110,0.28",
"3,0.452,0.034", "30,5.610,0.30", "31,6.158,0.31", "32,6.760,0.33",
"33,7.421,0.35", "34,8.147,0.37", "35,8.943,0.39", "36,9.817,0.42",
"37,10.78,0.45", "38,11.83,0.47", "39,12.99,0.50", "4,0.496,0.049",
"40,14.26,0.53", "41,15.65,0.56", "42,17.18,0.58", "43,18.86,0.59",
"44,20.70,0.59", "45,22.73,0.58", "46,24.95,0.55", "47,27.39,0.52",
"48,30.07,0.49", "49,33.01,0.46", "5,0.545,0.061", "50,36.24,0.45",
"51,39.78,0.45", "52,43.67,0.45", "53,47.94,0.44", "54,52.62,0.42",
"55,57.77,0.38", "56,63.41,0.35", "57,69.61,0.32", "58,76.42,0.31",
"59,83.89,0.33", "6,0.598,0.072", "60,92.09,0.36", "61,101.1,0.42",
"62,111.0,0.49", "63,121.8,0.59", "64,133.7,0.74", "65,146.8,0.94",
"66,161.2,1.19", "67,176.9,1.49", "68,194.2,1.82", "69,213.2,2.18",
"7,0.656,0.083", "70,234.1,2.55", "71,256.9,2.94", "72,282.1,3.34",
"73,309.6,3.78", "74,339.9,4.25", "75,373.1,4.73", "76,409.6,5.20",
"77,449.7,5.60", "78,493.6,5.87", "79,541.9,5.93", "8,0.721,0.093",
"80,594.9,5.77", "81,653.0,5.37", "82,716.8,4.77", "83,786.9,4.03",
"84,863.9,3.21", "85,948.3,2.36", "86,1041,1.55", "87,1143,0.81",
"88,1255,0.30", "89,1377,0.056", "9,0.791,0.10", "90,1512,0.0044",
"91,1660,0", "92,1822,0"), class = "factor")), row.names = c(NA,
6L), class = "data.frame")
Thanks very much for any help! Please let me know if there is anything I need to clarify.
1) Summary Table
You were on the right track with the group_by/summarise idea! There are just a few steps to do first.
# load tidyverse packages
library(tidyverse)
# load dataset
# (I did this differently than you did in the question - I'm not familiar with the structure function)
# Re-enter the raw data as one character column ("run,size,percent" per
# row), then split it into typed columns.
# NOTE(review): the first element "0,2000,0" repairs the malformed ",2000,"
# entry from the question's dput() output -- presumably a header artifact;
# confirm against the original file.
data <- tibble(x= c("0,2000,0", "1,0.375,0.013", "10,0.868,0.11",
"11,0.953,0.12", "12,1.047,0.12", "13,1.149,0.13", "14,1.261,0.14",
"15,1.385,0.14", "16,1.520,0.15", "17,1.668,0.15", "18,1.832,0.16",
"19,2.011,0.17", "2,0.412,0.023", "20,2.207,0.17", "21,2.423,0.18",
"22,2.660,0.19", "23,2.920,0.20", "24,3.205,0.21", "25,3.519,0.22",
"26,3.863,0.24", "27,4.240,0.25", "28,4.655,0.26", "29,5.110,0.28",
"3,0.452,0.034", "30,5.610,0.30", "31,6.158,0.31", "32,6.760,0.33",
"33,7.421,0.35", "34,8.147,0.37", "35,8.943,0.39", "36,9.817,0.42",
"37,10.78,0.45", "38,11.83,0.47", "39,12.99,0.50", "4,0.496,0.049",
"40,14.26,0.53", "41,15.65,0.56", "42,17.18,0.58", "43,18.86,0.59",
"44,20.70,0.59", "45,22.73,0.58", "46,24.95,0.55", "47,27.39,0.52",
"48,30.07,0.49", "49,33.01,0.46", "5,0.545,0.061", "50,36.24,0.45",
"51,39.78,0.45", "52,43.67,0.45", "53,47.94,0.44", "54,52.62,0.42",
"55,57.77,0.38", "56,63.41,0.35", "57,69.61,0.32", "58,76.42,0.31",
"59,83.89,0.33", "6,0.598,0.072", "60,92.09,0.36", "61,101.1,0.42",
"62,111.0,0.49", "63,121.8,0.59", "64,133.7,0.74", "65,146.8,0.94",
"66,161.2,1.19", "67,176.9,1.49", "68,194.2,1.82", "69,213.2,2.18",
"7,0.656,0.083", "70,234.1,2.55", "71,256.9,2.94", "72,282.1,3.34",
"73,309.6,3.78", "74,339.9,4.25", "75,373.1,4.73", "76,409.6,5.20",
"77,449.7,5.60", "78,493.6,5.87", "79,541.9,5.93", "8,0.721,0.093",
"80,594.9,5.77", "81,653.0,5.37", "82,716.8,4.77", "83,786.9,4.03",
"84,863.9,3.21", "85,948.3,2.36", "86,1041,1.55", "87,1143,0.81",
"88,1255,0.30", "89,1377,0.056", "9,0.791,0.10", "90,1512,0.0044",
"91,1660,0", "92,1822,0")) %>%
# separate into three fields
separate(x,
into = c("run", "size", "percent"),
sep = ",") %>%
# only keep useful fields - size and percent
select(size, percent) %>%
# change field types to numeric
# (mutate_all() is superseded in modern dplyr; across(everything(), ...)
# is the current idiom)
mutate_all(as.numeric)
# group by size: categories [0,2), [2,50), [50,2000]
# FIX: the question asked for groups 0-2, 2-50 and 50-2000, but the
# original used breaks = c(0, 2, 5, 2000) -- a 5 where 50 was intended.
data_summary <- data %>%
  mutate(size_bin = cut(size,
                        breaks = c(0, 2, 50, 2000),
                        include.lowest = TRUE,
                        right = FALSE)) %>%   # left-closed bins, as labeled
  group_by(size_bin) %>%
  # total response per bin
  summarise(percent_sum = sum(percent))
# take a look at the result
data_summary
2) Repeat process over several files
Yes, you can definitely set this up to run over many files in a folder!
Do you want all of the files to feed into one dataset? If so, here's the code you'd use:
# Read every file in the folder and stack the per-file summaries into one
# data frame.
# FIX 1: read_csv()'s first argument is `file`; the original passed the
#        nonexistent argument `path =`, which errors.
# FIX 2: breaks use 50 (not 5) to match the requested 0-2 / 2-50 / 50-2000
#        groups.
# NOTE(review): this assumes each file has a header row naming the
# size/percent columns -- confirm, or pass col_names to read_csv().
data_all <- list.files("folder_name/") %>%
  map_df(~ read_csv(paste0("folder_name/", .x)) %>%
           # only keep useful fields - size and percent
           select(size, percent) %>%
           # change field types to numeric
           mutate_all(as.numeric) %>%
           # group by size: categories [0,2), [2,50), [50,2000]
           mutate(size_bin = cut(size,
                                 breaks = c(0, 2, 50, 2000),
                                 include.lowest = TRUE,
                                 right = FALSE)) %>%
           group_by(size_bin) %>%
           summarise(percent_sum = sum(percent))
  )
If you want to keep the datasets separate, the code would be different. (I'm not sure how to code that right now, but I'll look into it if that's something you're interested in!)