Find the second largest value rowwise with dplyr R - r
Problem:
I am working with wages data and I want to flag outliers as possible measurement errors. For doing so, I am combining two criteria:
To receive more than twice the value of the 99th percentile of wages within a given year, relative to the whole distribution of wages on my dataset (comparison criteria between persons, within year)
To receive more than twice the value of the second highest wage within a same person, across years. That is an intra-individual criteria (comparison criteria within person, between years).
I accomplished to code the first criteria, but I am having some trouble with coding the second one.
My data is in the wide format. Perhaps the solution to my problem can be easier achieved by reshaping the data to the long format, but as I am working in a multi-author project, if I use this solution I need to reshape it back to the wide format again.
Data example:
Below, I provide some rows of my data with cases that already met the first criteria:
df <- structure(list(
wage_2010 = c(120408.54, 11234.67, 19918.64, NA, 66006.32, 40581.36, 344587.84, 331970.28, NA, 161351.45, NA, 115310.68, 323336.27, 9681.69, NA, 682324.53, 43764.76, 134023.61, 78195.16, 141231.5, 48163.23, 71259.66, 73858.65, 57737.6, NA, 182837.23), wage_2011 = c(413419.86, 24343.04, 36349.02, NA, 99238.53, 18890.34, 129921.58, 108714.29, NA, 169289.89, 36158.73, 129543.51, 130791.99, 13872.76, 4479.58, 222327.52, 826239.14, 48892.78, 78506.06, 111569.8, 653239.41, 813158.54, 72960.17, 80193.15, NA, 209796.19), wage_2012 = c(136750.86, 77386.62, 177528.17, 86512.48, 375958.76, 20302.29, 145373.42, 91071.64, 95612.23, 176866.72, 85244.44, 225698.7, 181093.52, 162585.23, 147918.83, 254057.11, 72845.46, 86001.31, 80958.22, 105629.12, 77723.77, 115217.74, 68959.04, 111843.87, 85180.26, 261942.95 ),
wage_2013 = c(137993.48, 104584.84, 239822.37, 95688.8, 251573.14, 21361.93, 142771.58, 92244.51, 111058.93, 208013.94, 111326.07, 254276.36, 193663.33, 225404.84, 84135.55, 259772.16, 100031.38, 100231.81, 824271.38, 107336.19, 95292.2, 217071.19, 125665.58, 74513.66, 116227.01, 245161.73), wage_2014 = c(134914.8, 527180.87, 284218.4, 112332.41, 189337.74, 23246.46, 144070.09, 92805.77, 114123.3, 251389.07, 235863.98, 285511.12, 192950.23, 205364.45, 292988.3, 318408.56, 86255.91, 497960.18, 85467.13, 152987.99, 145663.31, 242682.93, 184123.01, 107423.03, 132046.43, 248928.89), wage_2015 = c(168812.65, 145961.09, 280556.86, 256268.69, 144549.45, 23997.1, 130253.75, NA, 115522.88, 241031.91, 243697.87, 424135.76, 15927.33, 213203.96, 225118.19, 298042.59, 77749.09, 151336.85, 88596.38, 121741.45, 34054.26, 206284.71, 335127.7, 201891.17, 189409.04, 246440.69),
wage_2016 = c(160742.14, 129892.09, 251333.29, 137192.73, 166127.1, 537611.12, 139350.84, NA, 115395.21, 243154.02, 234685.36, 903334.7, NA, 205664.08, 695079.91, 33771.37, 100938.19, 138864.28, 58658.4, 98576.95, NA, 144613.53, 430393.04, 217989.1, 229369.56, 600079.86), wage_2017 = c(175932.3, 138128.41, 584536.47, 143506.22, 61674.63, 1442.8, 126084.46, NA, 575771.83, 586909.69, 372954.89, 701815.37, NA, 402347.33, 93873.2, NA, 96792.96, 172908.08, 89006.92, 631645.41, NA, 72183.55, 579455.71, 294539.56, 353615.43, 151327.43), wage_2018 = c(146111.42, 149313.9, 627679.77, 850182.4, 72654.62, 9129.35, 41544.24, NA, 248020.12, 334280.68, 611781.99, 597465.2, NA, 535628.5, 63369.44, NA, 93710.71, 146769.63, 100736.71, 108022.87, NA, 79019.43, 772012.47, 549097.81, 504183.59, 99129.6),
outlier_2010 = c(0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), outlier_2011 = c(1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0), outlier_2012 = c(0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), outlier_2013 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0), outlier_2014 = c(0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0), outlier_2015 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), outlier_2016 = c(0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1), outlier_2017 = c(0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0), outlier_2018 = c(0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0)),
groups = structure(list(.rows = structure(list(1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 15L, 16L, 17L, 18L, 19L, 20L, 21L, 22L, 23L, 24L, 25L, 26L), ptype = integer(0), class = c("vctrs_list_of", "vctrs_vctr", "list"))), row.names = c(NA, -26L), class = c("tbl_df", "tbl", "data.frame")), row.names = c(NA, -26L), class = c("rowwise_df", "tbl_df", "tbl", "data.frame"))
I have averages anual wages from 2010 to 2018, that is, 9 points in time. However, it seems to be hard to use a solution with the quantile function, because of possible missing values for some individuals in some years.
What I have tried:
So far I am using a median function within the dplyer approach. I flag as an outlier (possible error) if, in one given year, the individual receives more than twice the median of what he received across the years:
library(dplyr)
df1 <- df %>%
rowwise %>%
mutate(
median_wage = median(c(wage_2010, wage_2011, wage_2012, wage_2013, wage_2014, wage_2015, wage_2016, wage_2017, wage_2018), na.rm=T)) %>%
mutate(
individual_threshold = median_wage * 2,
) %>%
mutate(
outlier_2010 = case_when (wage_2010 > individual_threshold ~ 1, TRUE ~ 0),
outlier_2011 = case_when (wage_2011 > individual_threshold ~ 1, TRUE ~ 0),
outlier_2012 = case_when (wage_2012 > individual_threshold ~ 1, TRUE ~ 0),
outlier_2013 = case_when (wage_2013 > individual_threshold ~ 1, TRUE ~ 0),
outlier_2014 = case_when (wage_2014 > individual_threshold ~ 1, TRUE ~ 0),
outlier_2015 = case_when (wage_2015 > individual_threshold ~ 1, TRUE ~ 0),
outlier_2016 = case_when (wage_2016 > individual_threshold ~ 1, TRUE ~ 0),
outlier_2017 = case_when (wage_2017 > individual_threshold ~ 1, TRUE ~ 0),
outlier_2018 = case_when (wage_2018 > individual_threshold ~ 1, TRUE ~ 0))
However, when I inspect the data, I see that I am coding as outlier possible legitimate wages. For example, in the third row/person of my data, I am flagging as outliers wages in 2017 and 2018. However, as we can see, there is a pattern of increase in this person's wage. Although he receives more than twice his median wage in these years, probably that is not a mistake, as the increase was recorded in two years in a row.
In the forth row, however, the 2018 wage is more likely to be wrongly reported, since there is not a similar wage to that one for the same person. In 2018 year, that person wage grew more than 4 times than it was ever before (and also became more than twice the 99th percentile of the whole distribution).
Summing up:
I want to write a code to analyse 9 variables for every individual (or rowwise): wage_2010-2018, and compare the highest value to the second highest value. If the highest value is more than twice the size of the second highest value, I flag it as a possible measurement error. Preferably within dplyr.
Here's a way to do this with a helper function.
library(dplyr)
compare_2nd_highest <- function(x) {
#Sort the wages in descending order
x1 <- sort(x, decreasing = TRUE)
#Is the highest value more than double of second highest value
x1[1] > (x1[2] * 2)
}
df %>%
rowwise() %>%
mutate(is_outlier = compare_2nd_highest(c_across(starts_with('wage')))) %>%
ungroup
Related
Aggregate similar constructs/ FA with binary variables
I would like to aggregate, in order to reduce the number of constructs, its following data frame containing only binary variables that correspond to "yes/no", its following data frame (first 10 row). The original data frame contains 169 rows. outcome <- structure(list(Q9_Automazione.processi = c(0, 0, 0, 0, 0, 0, 1, 1, 1, 0), Q9_Velocita.Prod = c(1, 0, 0, 1, 0, 0, 1, 1, 1, 0), Q9_Flessibilita.Prod = c(0, 0, 0, 1, 0, 0, 1, 1, 0, 1), Q9_Controllo.processi = c(0, 0, 0, 1, 0, 0, 1, 1, 0, 0), Q9_Effic.Magazzino = c(0, 0, 0, 1, 0, 0, 0, 0, 0, 0), Q9_Riduz.Costi = c(0, 1, 0, 0, 0, 0, 0, 0, 0, 1), Q9_Miglior.Sicurezza = c(0, 0, 0, 0, 0, 0, 1, 0, 1, 1), Q9_Connett.Interna = c(0, 0, 0, 0, 0, 0, 0, 1, 1, 0), Q9_Connett.Esterna = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0), Q9_Virtualizzazione = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0), Q9_Innov.Prod = c(0, 0, 0, 0, 0, 1, 0, 0, 0, 1), Q9_Person.Prod = c(0, 1, 0, 1, 0, 1, 0, 0, 0, 1), Q9_Nuovi.Mercati = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0), Q9_Nuovi.BM = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0), Q9_Perform.Energ = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0), Q9_Perform.SostAmb = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0)), row.names = c(NA, 10L), class = "data.frame") I have tried performing factor analysis via the tethracoric method on the obtained correlation matrix ( the obtained value from the KMO function turns out to be inadequate) both directly on the dataframe and then using tethracoric correletions in fafunction (using cor = "tet" I get a negative Tucker Lewis Index). I have been reading up on this but cannot find a methodology that is adequate and of which I am certain of the correctness of the analysis. So basically what I would like to achieve is to aggregate similar constructs, e.g., assess whether column 5 has value 1 (i.e., "yes") almost always when column 11 has value 1 and then aggregate. Here the code that I try to used library(psych) tet <- tetrachoric(outcome) corrplot(tet$rho, "ellipse", tl.cex = 0.75, tl.col = "black") par(mfrow = c(1,2)) corr_matrix %>% ggcorrplot(show.diag = F, type="lower", lab=TRUE, lab_size=2) KMO(corr_matrix) cortest.bartlett(corr_matrix) fa.parallel(corr_matrix, fm = "ml") factor <- fa(corr_matrix, nfactors = 3, rotate = "oblimin", fm = "ml") print(factor, cut = 0.3, digits = 3) # -------- Pearson -------- cor(outcome, method = 'pearson', use = "pairwise.complete.obs") %>% ggcorrplot(show.diag = F, type="lower", lab=TRUE, lab_size=2) KMO(outcome) cortest.bartlett(outcome) fa.parallel(outcome) factor1 <- fa(outcome, nfactors = 3, rotate = "oblimin", cor = "tet", fm = "ml") print(factor1, cut = 0.3, digits = 3)
Creating a function to find precision by group
I have the following dataframe for which I am trying to calculate the precision of observations by group. df<- structure(list(BLG = c(77.634011090573, 119.341563786008, 12.0603015075377, 0, 155.275381552754, 117.391304347826, 81.1332904056665, 3.96563119629874, 91.566265060241), GSF = c(11.090573012939, 4.11522633744856, 0, 0, 0, 0, 0, 0, 0), LMB = c(73.9371534195933, 28.8065843621399, 24.1206030150754, 20.2360876897133, 59.721300597213, 13.0434782608696, 38.6349001931745, 31.7250495703899, 28.9156626506024), YLB = c(14.7874306839187, 4.11522633744856, 0, 0, 0, 0, 0, 0, 0), BLC = c(7.39371534195933, 0, 0, 20.2360876897133, 3.9814200398142, 0, 0, 7.93126239259749, 9.63855421686747), WHC = c(0, 0, 0, 0, 3.9814200398142, 0, 0, 0, 0), RSF = c(0, 0, 0, 0, 11.9442601194426, 0, 0, 0, 4.81927710843374 ), CCF = c(0, 0, 0, 0, 0, 0, 0, 0, 0), BLB = c(0, 0, 0, 0, 0, 0, 0, 0, 0), group = c(1L, 1L, 1L, 2L, 2L, 2L, 3L, 3L, 3L)), row.names = c(NA, -9L), class = c("data.table", "data.frame"), .internal.selfref = <pointer: 0x00000270a7061ef0>) I am trying to find the precision with this formula: Y_estimated= the value of in each cell of df Y_true= y_true<- c(83, 10, 47, 8, 9, 6, 12, 5, 8) #the true value for each column in df R= number of observations in each group (in this case=3) After applying the formula, I should have 3 measures of precision for each column. But I am unsure of how to make this formula into a function that will do this. Specifically the applying the epsilon by group and defining R. I've been working on the following: estimate = function(df, y_true) { R = 3 y_estimated = (df, .SD) (sum((sqrt( (y_estimated - y_true)^2 / 3))) / y_true) * 100 } But apart from this throwing errors (I think from the .SD in the y_estimated), I have to manually put in the value of R which I hope to not have to do given that this will be applied on data frames with multiple group sizes. Any help would be greatly appreciated.
Create bar plot for every level of a factor in a wide format data frame
I'm trying to create a bar plot using ggplot2 and my data is in this format: dput here: structure(list(clade = structure(c(1L, 3L, 2L, 3L, 2L, 2L), .Label = c("19A", "20A", "20B", "20E (EU1)", "20I (Alpha, V1)", "20J (Gamma, V3)", "21J (Delta)"), class = "factor"), C.T = c(0, 4, 4, 4, 4, 4), A.G = c(0, 1, 1, 1, 1, 1), G.A = c(0, 2, 0, 2, 0, 0), G.C = c(0, 1, 0, 1, 0, 0), T.C = c(0, 0, 0, 0, 0, 0), C.A = c(0, 0, 0, 0, 0, 0), G.T = c(0, 0, 0, 0, 0, 0), A.T = c(0, 0, 0, 0, 0, 0), T.A = c(0, 0, 0, 0, 0, 0), T.G = c(0, 0, 0, 0, 0, 0), A.C = c(0, 0, 0, 0, 0, 0), C.G = c(0, 0, 0, 0, 0, 0), A.del = c(0, 0, 0, 0, 0, 0), TAT.del = c(0, 0, 0, 0, 0, 0), TCTGGTTTT.del = c(0, 0, 0, 0, 0, 0), TACATG.del = c(0, 0, 0, 0, 0, 0), AGTTCA.del = c(0, 0, 0, 0, 0, 0), GATTTC.del = c(0, 0, 0, 0, 0, 0)), row.names = c(NA, -6L), class = c("data.table", "data.frame"), .internal.selfref = <pointer: 0x0000014b25a51ef0>) I'd like to create 7 bar plots (one for each "clade") where the X axis would have the columns of the data frame (C.T would be 1 bar, A.G would be another bar, etc) and the Y axis would be the count. Essentially, for each clade, print a barplot with the counts of column. For example, for the bar plot of the clade "20B" and the bar name "C.T" the count would be the sum of the values from the data frame. Can I do that in this wide format? Do I need to transform the data to a long format instead? I was trying to apply this SO answer: Plotting error bar on bar chart for a data frame in wide format using ggplot but I keep getting choose another strategy with names_repair Thank you in advance, any help is very welcome!
How can I make this replacement of values based on order more computationally efficient in R? [duplicate]
This question already has answers here: Get value of a matrix with row-index and column-index [duplicate] (2 answers) Closed 2 years ago. I have a df that of 32 columns and just under a million rows. The columns are the POINTID (individual id), First (year that an event first happened), and then 30 columns of years w binary occurrence data. I would like the first occurrence in each row (currently stored as a 1, same as all other occurrences) to be changed to a 2, so that I can differentiate between the first event and repeat events. I've tried doing this with the tidyverse, but even then it is taking forever. I can't tell if my code is just wrong or if it's not computationally efficient enough. I tested it on a smaller dataset and it seemed to work, in the long format but not the wide, so I'm thinking it's an efficiency issue because the pivot_longer table generated is about about 35 million rows long. Can anyone help me understand why this isn't working or how to do it in a way that computes faster? classifications %>% pivot_longer(-c(1,32),names_to="Years", values_to="Present")%>% group_by(POINTID)%>% mutate(Present=replace(Present, Years==first, 2)) A reduced version of my DF is below: > dput(classifications) structure(list(POINTID = 2:11, first = structure(c(33L, 33L, 33L, 33L, 1L, 33L, 33L, 1L, 1L, 36L), .Label = c("X1985", "X1986", "X1987", "X1988", "X1989", "X1990", "X1991", "X1992", "X1993", "X1994", "X1995", "X1996", "X1997", "X1998", "X1999", "X2000", "X2001", "X2002", "X2003", "X2004", "X2005", "X2006", "X2007", "X2008", "X2009", "X2010", "X2011", "X2012", "X2013", "X2014", "X2015", "X2016", "X2017", "X2018", "X2019", "X2020"), class = "factor"), X1990 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0), X1991 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0), X1992 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0), X1993 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0), X1994 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0), X1995 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0), X1996 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0), X1997 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0), X1998 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0), X1999 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0), X2000 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0), X2001 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0), X2002 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0), X2003 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0), X2004 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0), X2005 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0), X2006 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0), X2007 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0), X2008 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0), X2009 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0), X2010 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0), X2011 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0), X2012 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0), X2013 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0), X2014 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0), X2015 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0), X2016 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0), X2017 = c(1, 1, 1, 1, 0, 1, 1, 0, 0, 0), X2018 = c(1, 0, 0, 0, 0, 0, 0, 0, 0, 0), X2019 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0), X2020 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 1)), row.names = c(NA, 10L), class = "data.frame")
You can do this keeping the data in wide format with vectorised operations of row/column subsetting. We get the column index using match. mat <- cbind(1:nrow(classifications), match(classifications$first, names(classifications))) classifications[mat] <- 2
R apply funciton on each cell in data frame
I have a data frame that look something like this > dput(tes) structure(list(path = structure(1:6, .Label = c("1893-chicago-fair", "1960s-afghanistan", "1970s-iran", "1980s-new-york", "20-bizarre-vintage-ads", "20-bizarre-vintage-ads?utm_campaign=6678&utm_medium=rpages&utm_source=Facebook&utm_term=1e8e704f7b587515c72e6cf7895d55fd110b652c480d98c1440f0a7acba5fb0e", "20-photos-segregation-america-show-far-weve-come-much-farther-go", "7-bizarre-cultural-practices", "7-creepy-abandoned-cities?utm_campaign=6928&utm_medium=rpages&utm_source=Facebook&utm_term=4015a7368b588ff09694c96ba720c58f4e7f41a05b4181908b582bae682bef5e", "a-brief-history-of-hippies", "abandoned-photographs", "albert-kahn", "amazing-facts", "american-bison-extinction-1800s", "american-english-vs-british-english", "andre-the-giant-photos", "andre-the-giant-photos??utm_source=facebook&sr_source=lift_facebook&utm_campaign=simplereach_andre-the-giant-photos&utm_medium=social", "andre-the-giant-photos?grvVariant=d27feef0bfad84d60f335d3a8d241d9e", "andre-the-giant-photos?grvVariant=d27feef0bfad84d60f335d3a8d241d9e&utm_campaign=gravityus2_142deb68f67fb1a99e7b80250fecc932&utm_medium=referral&utm_source=gravity", "andre-the-giant-photos?grvVariant=d27feef0bfad84d60f335d3a8d241d9e&utm_campaign=gravityus2_16d63cf07ecf656f602b2d6b209344f7&utm_medium=referral&utm_source=gravity", "andre-the-giant-photos?grvVariant=d27feef0bfad84d60f335d3a8d241d9e&utm_campaign=gravityus2_713050ecffc51540af02b2246ddf57dd&utm_medium=referral&utm_source=gravity", "andre-the-giant-photos?grvVariant=d27feef0bfad84d60f335d3a8d241d9e&utm_campaign=gravityus2_c5bb3bc5e9408e0ad52ec9e787bd8654&utm_medium=referral&utm_source=gravity", "andre-the-giant-photos?sr_source=lift_facebook&utm_campaign=simplereach_andre-the-giant-photos&utm_medium=social&utm_source=facebook", "astounding-aerial-photography", "astounding-aerial-photography?utm_campaign=7002&utm_medium=rpages&utm_source=Facebook&utm_term=38e9e903d9ba59106d8b4d19be593f3de7ff8b91b12eafa03f2e382228f7b0d1", "august-landmesser", "ben-franklin", "best-all-that-is-interesting-articles", "bigfoot-facts", "celebrity-school-photos?grvVariant=82c0ce57a33dfd0209bdefc878665de0&utm_campaign=gravityus2_bc8646aefd6d0a16af03d7caf248f226&utm_medium=referral&utm_source=gravity", "coolest-mushrooms?utm_campaign=taboolaINTL&utm_medium=referral&utm_source=taboola", "craziest-ways-drugs-smuggled", "creepy-halloween-costumes", "danakil-depression", "dark-john-lennon-quotes", "david-bowie-quotes", "days-in-groundhog-day", "death-photos", "death-photos?utm_campaign=taboolaINTL&utm_medium=referral&utm_source=taboola", "dr-seuss-quotes", "dream-chaser-spacecraft", "dust-bowl", "earth-two-planets", "eixample-barcelona", "email-to-space", "evil-science-experiments", "famous-incest", "famous-spies", "fun-facts-trivia", "golden-age-air-travel?utm_campaign=taboolaINTL&utm_medium=referral&utm_source=taboola", "gross-foods", "gross-foods?utm_campaign=6928&utm_medium=rpages&utm_source=Facebook&utm_term=106965c54919c24bf37356500ec50f0709b1de621d6950bb4c5d48759ea3677e", "gross-foods?utm_campaign=6928&utm_medium=rpages&utm_source=Facebook&utm_term=184e0ee39e66af82f9b124b904f6e07964b211e902cb0dc00c28771ff46163a2", "gross-foods?utm_campaign=6928&utm_medium=rpages&utm_source=Facebook&utm_term=1a0ddea7bed770d5473c45e9f8d81dfd0c4fdd232f207c6b88b53c41ff220c59", "gross-foods?utm_campaign=6928&utm_medium=rpages&utm_source=Facebook&utm_term=538659f1fc53f28d2c87b93ac73973681c1a46a04954964ab6c52ed1ab09b33a", "gross-foods?utm_campaign=6928&utm_medium=rpages&utm_source=Facebook&utm_term=87caf0acb91ae2b202f1b00ad9eaad3fef20bbfb23405b9047fb2b5a5462ab9c", "gross-foods?utm_campaign=6928&utm_medium=rpages&utm_source=Facebook&utm_term=91eae42c8fc9568103d46e0b6b6ec08fc34fd68b2e1918ffe2333ec73035c95a", "gross-foods?utm_campaign=6928&utm_medium=rpages&utm_source=Facebook&utm_term=a72946874b2003a8e40635c6cf10c851d4e1c0ed45e645d69663214239550602", "gross-foods?utm_campaign=6928&utm_medium=rpages&utm_source=Facebook&utm_term=ab594f0a1be002c8c3db297e8d33b04678af40e6a6469ac815884ae0a014b3a3", "gross-foods?utm_campaign=6928&utm_medium=rpages&utm_source=Facebook&utm_term=fb1e333dd58cb7bb9251ec52290aae21771149f73e083440047068a69aaeae09", "hilarious-insults", "hippie-communes", "hippie-communes?grvVariant=fda07538efb1c25617f7cc3d09c37c79", "hippie-communes?grvVariant=fda07538efb1c25617f7cc3d09c37c79&utm_campaign=gravityus2_e3cd42d4745768460dab4694a972fd82&utm_medium=referral&utm_source=gravity", "hippie-communes?pp=0", "history-of-the-vibrator", "history-of-the-vibrator?utm_campaign=whfbpd&utm_medium=social&utm_source=facebook", "homosexuality-norm", "hunger-games-facts?utm_campaign=6905&utm_medium=rpages&utm_source=Facebook&utm_term=1a9e42ac8abb6ffa90bf0542206505e74d3df12114a2c4445527fb2b88ef8880", "influential-photographs", "ingeniously-creative-ads", "insane-cults", "insane-rulers", "inspirational-quotes", "inspirational-quotes?utm_medium=referral&utm_source=taboolainternal", "interesting-facts-about-the-world", "interesting-quotes", "krokodil", "making-a-murderer-theories", "maya-angelou-greatest-quotes", "medieval-torture-devices", "milky-way-colorado", "montreal-metro", "most-popular-female-names-in-america", "neil-degrasse-tyson-tweets", "new-york-city-cinemagraphs", "new-york-subways-1980s", "north-korea-photographs", "north-korea-photographs?utm_campaign=taboolaINTL&utm_medium=referral&utm_source=taboola", "north-korea-photographs?utm_medium=referral&utm_source=taboolainternal", "obama-aging", "pablo-escobar", "pablo-escobar??utm_source=facebook", "pablo-escobar??utm_source=facebook&sr_source=lift_facebook&utm_campaign=simplereach_pablo-escobar&utm_medium=social", "pablo-escobar?utm_campaign=whfbpd&utm_medium=social&utm_source=facebook", "panda-facts", "photo-of-the-day-nasa-releases-crystal-clear-image-of-pluto", "pollution-in-china-photographs", "pollution-in-china-photographs?utm_campaign=3434&utm_medium=rpages&utm_source=Facebook&utm_term=1a0ddea7bed770d5473c45e9f8d81dfd0c4fdd232f207c6b88b53c41ff220c59", "pollution-in-china-photographs?utm_campaign=3434&utm_medium=rpages&utm_source=Facebook&utm_term=e28a76c1572c36c3a13965e52b4b2ea10518eb9f9c79c4bc84cfb85db16be81e", "pollution-in-china-photographs?utm_campaign=6806&utm_medium=rpages&utm_source=Facebook&utm_term=1a0ddea7bed770d5473c45e9f8d81dfd0c4fdd232f207c6b88b53c41ff220c59", "pollution-in-china-photographs?utm_campaign=7048&utm_medium=rpages&utm_source=Facebook&utm_term=2ef4bd7b6cd587601d6eeb35925282a1ed095ebbd4e9e4c0337ef868c7de7a0b", "pollution-in-china-photographs?utm_campaign=7458&utm_medium=rpages&utm_source=Facebook&utm_term=b9e79a51cd4daf4c3ec02accce75b3e1fc9a22cb3133460c9c32a4f2f9cdb68c", "powerful-photos-of-2014", "real-x-files", "romanovs-last-days", "science-of-human-decay", "scientific-discoveries-2015", "scully-effect", "serial-killer-quotes", "shah-iran", "six-of-the-craziest-gods-in-mythology", "space-facts", "sun-facts", "sunken-cities", "sunken-ships", "super-bowl-i-facts", "superhero-movies", "surreal-places", "syrian-civil-war-photographs", "the-five-greatest-mysteries-of-human-history", "the-four-most-important-battles-of-ancient-greece", "the-most-colorful-cities-in-the-world", "titanic-facts", "titanic-facts?utm_campaign=6385&utm_medium=rpages&utm_source=Facebook&utm_term=f5905e878216d14e20457ee3265caf6c10022d9545609edfb9a3cb0642c1a310", "titanic-facts?utm_campaign=6899&utm_medium=rpages&utm_source=Facebook&utm_term=b9e79a51cd4daf4c3ec02accce75b3e1fc9a22cb3133460c9c32a4f2f9cdb68c", "titanic-facts?utm_campaign=6928&utm_medium=rpages&utm_source=Facebook&utm_term=106965c54919c24bf37356500ec50f0709b1de621d6950bb4c5d48759ea3677e", "titanic-facts?utm_campaign=6928&utm_medium=rpages&utm_source=Facebook&utm_term=538659f1fc53f28d2c87b93ac73973681c1a46a04954964ab6c52ed1ab09b33a", "titanic-facts?utm_campaign=6928&utm_medium=rpages&utm_source=Facebook&utm_term=91eae42c8fc9568103d46e0b6b6ec08fc34fd68b2e1918ffe2333ec73035c95a", "titanic-facts?utm_campaign=6928&utm_medium=rpages&utm_source=Facebook&utm_term=ab594f0a1be002c8c3db297e8d33b04678af40e6a6469ac815884ae0a014b3a3", "titanic-facts?utm_campaign=6928&utm_medium=rpages&utm_source=Facebook&utm_term=d1864657a05e5b716bb5cb16a29f068a55652eb39fb669ea9c22a6486198f227", "titanic-facts?utm_campaign=7292&utm_medium=rpages&utm_source=Facebook&utm_term=f5905e878216d14e20457ee3265caf6c10022d9545609edfb9a3cb0642c1a310", "us-veterans-portraits", "vintage-disneyland", "wall-street-early-20th-century", "what-we-love-this-week-the-incredible-last-words-of-famous-historical-figures", "woodstock-photos", "zombie-proof-house"), class = "factor"), `0089` = c(0, 0, 0, 0, 0, 1), `0096` = c(0, 0, 0, 0, 0, 0 ), `02` = c(0, 0, 0, 0, 0, 0), `0215` = c(0, 0, 0, 0, 0, 0), `0225` = c(0, 0, 0, 0, 0, 0), `0252` = c(0, 0, 0, 0, 0, 0), `0271` = c(0, 0, 0, 0, 0, 0), `0272` = c(0, 0, 0, 0, 0, 0), `03` = c(0, 0, 0, 0, 1, 1)), .Names = c("path", "0089", "0096", "02", "0215", "0225", "0252", "0271", "0272", "03"), row.names = c(NA, 6L), class = "data.frame") and I need to apply the min(x,1) function such that this function scan each value in the dataframe (except first column which is not numeric) and return the min(x,1). that way I have only zero's and one's. I have tried: f <- function(x) min(1,x) res1<-do.call(f,tes[,2:ncol(tes)]) but that does not output the right result. Any help aapreciated
We can use pmin tes[,-1] <- pmin(1, as.matrix(tes[,-1])) Or if we need only binary values tes[,-1] <- +(!!tes[,-1])