I'm trying to write a program in R language and i use for loops and if statement
I have a data set that contains 17 rows and 1091 columns (variables).
I want to compare the values in the 17th row and put the columns that have the same values into one data frame (to treat them afterwards).
The algorithm I thought of consists of the following steps:
1-Take the column i want to compare and put it in new data frame (Sub_data)
2- compare the value in the 17th of this column with all the others values of other columns in the first data (All_data)
3-when the value of the column equal to the value of any other column (B) take that column B and add it to the data frame
4-after that i want to compare the variation of the variables in the Sub_data (that contains the same values of the 17th rows) and chose one column of the columns that has the same variation and eliminate the others
Here i present the rows and the first two columns of my data ( All_data)
MT95T843 MT95T756
QC_G.F9_01_4768 70027.0213162601 95774.1359666849
QC_G.F9_01_4765 69578.1863357392 81479.2957458262
QC_G.F9_01_4762 69578.1863357392 87021.9542724389
QC_G.F9_01_4759 68231.1433794304 95558.7673782843
QC_G.F9_01_4756 64874.1293568862 96780.772452217
QC_G.F9_01_4753 63866.6577969569 91854.3530432699
CtrF01R5_G.D1_01_4757 66954.3879935821 128861.361627886
CtrF01R4_G.D5_01_4763 97352.5522885788 101353.25926633
CtrF01R3_G.C8_01_4754 61311.7857641721 7603.60895516428
CtrF01R2_G.D3_01_4760 85768.3611731878 109461.75444564
CtrF01R1_G.C9_01_4755 85302.8194715206 104253.845374077
BtiF01R5_G.D7_01_4766 61252.4254487766 115683.737549183
BtiF01R4_G.D6_01_4764 81873.9637852956 112164.142293011
BtiF01R3_G.D2_01_4758 84981.2191408476 0
BtiF01R2_G.D4_01_4761 36629.0246187626 124806.491006666
BtiF01R1_G.D8_01_4767 0 109927.264246577
rt 13.9018138671285 13.9058590777331
Code for input dataframe :
# Reprex input: 17 rows (16 samples plus a final "rt" row) for two
# metabolite columns. Built vectors-first so each column is easy to inspect.
sample_names <- c(
  "QC_G.F9_01_4768", "QC_G.F9_01_4765", "QC_G.F9_01_4762", "QC_G.F9_01_4759",
  "QC_G.F9_01_4756", "QC_G.F9_01_4753", "CtrF01R5_G.D1_01_4757",
  "CtrF01R4_G.D5_01_4763", "CtrF01R3_G.C8_01_4754", "CtrF01R2_G.D3_01_4760",
  "CtrF01R1_G.C9_01_4755", "BtiF01R5_G.D7_01_4766", "BtiF01R4_G.D6_01_4764",
  "BtiF01R3_G.D2_01_4758", "BtiF01R2_G.D4_01_4761", "BtiF01R1_G.D8_01_4767",
  "rt"
)
mt95t843 <- c(
  70027.0213162601, 69578.1863357392, 69578.1863357392, 68231.1433794304,
  64874.1293568862, 63866.6577969569, 66954.3879935821, 97352.5522885788,
  61311.7857641721, 85768.3611731878, 85302.8194715206, 61252.4254487766,
  81873.9637852956, 84981.2191408476, 36629.0246187626, 0, 13.9018138671285
)
mt95t756 <- c(
  95774.1359666849, 81479.2957458262, 87021.9542724389, 95558.7673782843,
  96780.772452217, 91854.3530432699, 128861.361627886, 101353.25926633,
  7603.60895516428, 109461.75444564, 104253.845374077, 115683.737549183,
  112164.142293011, 0, 124806.491006666, 109927.264246577, 13.9058590777331
)
df1 <- data.frame(Name = sample_names, MT95T843 = mt95t843, MT95T756 = mt95t756)
df1
#> Name MT95T843 MT95T756
#> 1 QC_G.F9_01_4768 70027.02132 95774.13597
#> 2 QC_G.F9_01_4765 69578.18634 81479.29575
#> 3 QC_G.F9_01_4762 69578.18634 87021.95427
#> 4 QC_G.F9_01_4759 68231.14338 95558.76738
#> 5 QC_G.F9_01_4756 64874.12936 96780.77245
#> 6 QC_G.F9_01_4753 63866.65780 91854.35304
#> 7 CtrF01R5_G.D1_01_4757 66954.38799 128861.36163
#> 8 CtrF01R4_G.D5_01_4763 97352.55229 101353.25927
#> 9 CtrF01R3_G.C8_01_4754 61311.78576 7603.60896
#> 10 CtrF01R2_G.D3_01_4760 85768.36117 109461.75445
#> 11 CtrF01R1_G.C9_01_4755 85302.81947 104253.84537
#> 12 BtiF01R5_G.D7_01_4766 61252.42545 115683.73755
#> 13 BtiF01R4_G.D6_01_4764 81873.96379 112164.14229
#> 14 BtiF01R3_G.D2_01_4758 84981.21914 0.00000
#> 15 BtiF01R2_G.D4_01_4761 36629.02462 124806.49101
#> 16 BtiF01R1_G.D8_01_4767 0.00000 109927.26425
#> 17 rt 13.90181 13.90586
I'm stuck in the third step where i got this error message
Error in Sub_data[1, i] : subscript out of bounds
Here's the code i used :
library("readxl")
library("janitor")

# All_data: 17 rows x 1091 columns; row 17 ("rt") holds the value the
# columns are grouped by.
All_data <- read_excel("DataMatrix_Excel.xlsx")
dim(All_data)
#> 17 1091

# For every column i, build Sub_data = column i plus every other column whose
# 17th-row value equals column i's 17th-row value.
#
# Fixes relative to the original post:
#  * the reference column was hard-coded to column 1 instead of column i,
#    so every iteration compared against the same column;
#  * the for-loops and the if-body were never closed (missing braces);
#  * seq_len(ncol(All_data)) replaces the hard-coded 1:1091, so the code
#    cannot index past the last column ("subscript out of bounds") if the
#    sheet is narrower than expected;
#  * drop = FALSE (spelled out, not `F`) keeps single-column data frames.
for (i in seq_len(ncol(All_data))) {
  # Start the subset with the reference column itself.
  Sub_data <- All_data[, i, drop = FALSE]
  for (j in seq_len(ncol(All_data))) {
    # Compare the 17th-row ("rt") values; skip the reference column itself.
    if (j != i && All_data[[i]][17] == All_data[[j]][17]) {
      Sub_data <- cbind(Sub_data, All_data[, j, drop = FALSE])
      # Progress check only.
      print(paste("The dim is ", dim(Sub_data)))
    }
  }
}
Please tell me if you need any more information or clarification; suggestions are also very welcome.
Thank you very much
I have a list of lists that looks like this:
> class(cladelist)
[1] "list"
# Each top-level element is one clade (dendrogram node id "46".."50") holding
# the tip labels of its members. parse_members() preserves the original
# scan(text =, what = "") behavior, including its "Read n items" message.
parse_members <- function(txt) {
  scan(text = txt, what = "")
}

cladelist <- list(
  `46` = parse_members(' "KbFk2" "PeHa3" "PeHa51" "EeBi27" "EeBi17" "PeHa23" "PeHa44" "EeBi4" "EeBi26" "PeHa8" "PeHa26" "EeBi24" "EeBi3"
"EeBi20" "KbFk5" "PeHa15" "PeHa43" "PeHa11" "PeHa12" "PeHa49" "PeHa67" "PeHa17" "PeHa59" "KbFk4" "PeHa10" "PeHa55"
"PeHa73" "EeBi23" "PeHa78" "PeHa81" "EeBi11" "PeHa45" "EeBi6" "EeBi34" "PeHa25" "PeHa52" "PeHa62" "PeHa31" "PeHa65"
"PeHa47" "PeHa50" "PeHa34" "PeHa54" "PeHa22" "PeHa30"'),
  `47` = parse_members('
 "KbFk2" "EeBi27" "EeBi17" "EeBi4" "EeBi26" "EeBi3" "EeBi20" "KbFk5" "KbFk4" "EeBi6" "EeBi34"'),
  `48` = parse_members(' "PeHa3" "PeHa51" "PeHa23" "PeHa44" "PeHa8" "PeHa26" "EeBi24" "PeHa15" "PeHa43" "PeHa11" "PeHa12" "PeHa49" "PeHa67"
"PeHa17" "PeHa59" "PeHa10" "PeHa55" "PeHa73" "EeBi23" "PeHa78" "PeHa81" "EeBi11" "PeHa45" "PeHa25" "PeHa52" "PeHa62"
"PeHa31" "PeHa65" "PeHa47" "PeHa50" "PeHa34" "PeHa54" "PeHa22" "PeHa30"'),
  `49` = parse_members(' "PeHa51" "PeHa23" "PeHa44" "PeHa8" "PeHa26" "EeBi24" "PeHa15" "PeHa43" "PeHa11" "PeHa12" "PeHa49" "PeHa67" "PeHa17"
"PeHa59" "PeHa10" "PeHa55" "PeHa73" "EeBi23" "PeHa78" "PeHa81" "EeBi11" "PeHa45" "PeHa25" "PeHa52" "PeHa62" "PeHa31"
"PeHa65" "PeHa47" "PeHa50" "PeHa34" "PeHa54" "PeHa22" "PeHa30"'),
  `50` = parse_members(' "EeBi27" "EeBi17" "EeBi4" "EeBi26" "EeBi3" "EeBi20" "KbFk5" "KbFk4" "EeBi6" "EeBi34"')
)
Each of these sublists (i.e. "46", "47", etc.) represents a clade in a dendrogram that I've extracted using:
> cladelist <- clade.members.list(VB.phy, tips = FALSE, tip.labels = TRUE, include.nodes=FALSE)
I'm trying to find each unique pair within each sublist, and to calculate the total number of times that pair appears across all sublists (clades).
The ideal output would be a dataframe that looks like this where the count is the number of times this pair was found between all sublists (clades):
Pair Count
Peha1/PeHa2 2
Peha1/PeHa3 4
PeHa1/PeHa4 7
PeHa1/PeHa5 3
What sort of formulas am I looking for?
Background for the question (just for interest, doesnt add that much to question):
The idea is that I have a data set of 121 of these elements (Peha1, KbFk3, etc). They are artifacts (I'm an archaeologist) that I'm evaluating using 3D geometric morphometrics. The problem is that these artifacts are not all complete; they are damaged or degraded and thus provide an inconsistent amount of data. So I've had to reduce what data I use per artifact to have a reasonable, yet still inconsistent, sample size. By selecting certain variables to evaluate, I can get useful information, but it requires that I test every combination of variables. One of my analyses gives me the dendograms using divisive hierarchical clustering.
Counting the frequency of each pair as found between each clade should be the strength of the relationship of each pair of artifacts. That count I will then divide by total number of clades in order to standardize for the following step. Once I've done this for X number of dendograms, I will combine all these values for each pair, and divide them by the number representing whether that pair appeared in a dendogram (if it shows up in 2 dendograms, that I divide by 2), because each pair will not appear in each of my tests and I have to standardize it so that more complete artifacts that appear more often in my tests don't have too much more weight. This should allow me to evaluate which pairs have the strongest relationships.
This falls into a set of association kind of problems for which I find the widyr package to be super useful, since it does pairwise counts and correlations. (The stack() function just converts into a dataframe for the rest to flow).
I couldn't check against your sample output, but for an example like "PeHa23/PeHa51", the output shows they are paired together in 3 different clades.
This currently doesn't include zero counts to exhaust all possible pairs, but that could be shown as well (using complete()).
UPDATE: Made references clearer for packages like dplyr, and filtered so that counts are non-directional (item1-item2 is same as item2-item1 and can be filtered).
library(tidyverse)
library(widyr)

# Flatten the named list of clades into long form: one row per
# (artifact, clade) membership. stack() yields columns values/ind.
df <- stack(cladelist) %>%
  dplyr::rename(clade = "ind", artifact = "values")

# For every pair of artifacts, count how many clades contain both.
# Keeping only item1 > item2 drops the mirrored duplicate of each pair
# (the counts are symmetric).
df %>%
  widyr::pairwise_count(feature = clade, item = artifact) %>%
  filter(item1 > item2) %>%
  dplyr::transmute(Pair = paste(item1, item2, sep = "/"), Count = n)
#> # A tibble: 990 x 2
#> Pair Count
#> <chr> <dbl>
#> 1 PeHa3/KbFk2 1
#> 2 PeHa51/KbFk2 1
#> 3 PeHa23/KbFk2 1
#> 4 PeHa44/KbFk2 1
#> 5 PeHa8/KbFk2 1
#> 6 PeHa26/KbFk2 1
#> 7 KbFk5/KbFk2 2
#> 8 PeHa15/KbFk2 1
#> 9 PeHa43/KbFk2 1
#> 10 PeHa11/KbFk2 1
#> # … with 980 more rows
I'm beginner dealing with R and working with strings.
I've been trying to remove periods from data but unfortunately I can't find a solution.
This is the data I'm working on in a dataframe df:
# Reprex input: 24 months of revenue. Rows 97-114 use Brazilian number
# formatting ("." as thousands separator, "," as decimal mark); rows 115-120
# are already plain. stringsAsFactors = TRUE reproduces the situation in the
# question: receita is stored as a factor.
raw_rows <- " n mesAno receita
97 1/2009 3.812.819.062,06
98 2/2009 4.039.362.599,36
99 3/2009 3.652.885.587,18
100 4/2009 3.460.247.960,02
101 5/2009 3.465.677.403,12
102 6/2009 3.131.903.622,55
103 7/2009 3.204.983.361,46
104 8/2009 3.811.786.009,24
105 9/2009 3.180.864.095,05
106 10/2009 3.352.535.553,88
107 11/2009 5.214.148.756,95
108 12/2009 4.491.795.201,50
109 1/2010 4.333.557.619,30
110 2/2010 4.808.488.277,86
111 3/2010 4.039.347.179,81
112 4/2010 3.867.676.530,69
113 5/2010 6.356.164.873,94
114 6/2010 3.961.793.391,19
115 7/2010 3797656130.81
116 8/2010 4709949715.37
117 9/2010 4047436592.12
118 10/2010 3923484635.28
119 11/2010 4821729985.03
120 12/2010 5024757038.22"

df <- read.table(text = raw_rows, header = TRUE, stringsAsFactors = TRUE)
My objective is to transform the receita column to numeric, as it is being stored as a factor. But applying conversion functions like as.numeric(as.character(x)) does not work in the interval 97:114 (it coerces to NA's).
I suppose that this is because of the periods separating billion/million/thousands in this column.
The mentioned conversion functions will work only if I have something like 3812819062.06 as in 115:120.
I tried mutating the dataset adding another column and modelling.
I don't really know if what i'm doing is fine, but i also tried extracting the anomalous numbers to a variable, and applying sub/gsub on them but without success.
Is there a straightforward way of doing this — that is, instructing it to remove the leading occurrences of '.' and then replace the comma with a '.'?
I'm very confident that the function i'm needing is gsub but i'm having a hard time finding the correct usage. Any help will be appreciated.
Edit: My approach using dplyr::mutate(). Ugly but works.
# Workaround: count the separator dots per value, strip them from the rows
# that use "," as the decimal mark, then turn that comma into a ".".
# NOTE(review): gsub() coerces its input with as.character(), so the
# as.factor() wrappers below are redundant but harmless.
df <- df %>%
mutate(receita_temp = receita) %>%
# dot_count == 3 marks the "3.812.819.062,06"-style rows (separator dots).
mutate(dot_count = str_count(receita, '\\.')) %>%
# 3-dot rows: remove all dots. Other rows: replace the comma — a no-op for
# the plain "3797656130.81"-style rows, which contain no comma.
mutate(receita_temp = ifelse(dot_count == 3,
gsub('\\.', '', as.factor(receita_temp)),
gsub('\\,', '.',as.factor(receita_temp))
)) %>%
# Second pass: the 3-dot rows still carry their decimal comma; fix it now.
mutate(receita_temp = ifelse(dot_count == 3,
gsub('\\,', '.',as.factor(receita_temp)),
receita_temp)) %>%
# Drop the helper column and the original, then restore the original name.
select(-c(dot_count, receita)) %>%
rename(., receita = receita_temp)
I'm using regex and some stringr functions to remove all the periods except those followed by two digits and the end of the string. That way, periods denoting separation like in 3.811.786.009,24 are removed, but periods denoting the start of a decimal like in 4821729985.03 are not. Using str_remove_all rather than str_remove lets me not have to worry about removing the matches repeatedly or about how well it will scale. Then replace the remaining commas with periods, and make it numeric.
library(tidyverse)

# Strip thousands separators: remove every "." that is NOT immediately
# followed by two-or-more digits running to the end of the string, so a true
# decimal point (as in 4821729985.03) survives. Then turn the decimal comma
# into a period and coerce to numeric.
df2 <- df %>%
  mutate(
    receita = as.numeric(
      str_replace_all(
        str_remove_all(receita, "\\.(?!\\d{2,}$)"),
        ",", "."
      )
    )
  )

print(head(df2), digits = 12)
#> n mesAno receita
#> 1 97 1/2009 3812819062.06
#> 2 98 2/2009 4039362599.36
#> 3 99 3/2009 3652885587.18
#> 4 100 4/2009 3460247960.02
#> 5 101 5/2009 3465677403.12
#> 6 102 6/2009 3131903622.55
Created on 2018-09-04 by the reprex package (v0.2.0).
You can use the following:
first create a function that will be used for replacement:
# Lookup used as a str_replace_all() replacement function: a matched "."
# (thousands separator) maps to "", a matched "," maps to ".".
repl <- function(x) c("." = "", "," = ".")[x]
This function takes in either "." or "," and returns "" or '.' respectively
Now use this function to replace
# Replace each match of "separator dot OR comma" via the repl() lookup:
# "[.](?!\\d+$)" is a dot NOT followed only by digits to end-of-string (so
# the true decimal point survives); repl maps "." -> "" and "," -> ".".
stringr::str_replace_all(as.character(df[,3]), "[.](?!\\d+$)|,", repl)
[1] "3812819062.06" "4039362599.36" "3652885587.18" "3460247960.02" "3465677403.12" "3131903622.55"
[7] "3204983361.46" "3811786009.24" "3180864095.05" "3352535553.88" "5214148756.95" "4491795201.50"
[13] "4333557619.30" "4808488277.86" "4039347179.81" "3867676530.69" "6356164873.94" "3961793391.19"
[19] "3797656130.81" "4709949715.37" "4047436592.12" "3923484635.28" "4821729985.03" "5024757038.22"
Of course you can do the rest. ie calling as.numeric() etc.
To do this in base R:
# Base R equivalent: the perl-mode lookahead removes separator dots (any "."
# not followed only by digits to the end), then the single decimal comma in
# each value becomes the decimal point.
sub(",", ".", gsub("[.](?!\\d+$)", "", as.character(df[, 3]), perl = TRUE))
or If you know the exact number of . and , in your data, you could do
# Exact-count trick: every formatted value in rows 97:114 has exactly three
# separator dots and one decimal comma, so its four regex matches can be
# replaced positionally with c("", "", "", ".").
a = as.character(df[,3])
# regmatches<- assigns the replacement vector to the matches found by
# gregexpr(). Elements with zero matches (the plain "3797656130.81"-style
# rows) receive a zero-length replacement and pass through unchanged; the
# single list element on the right is recycled across all elements of `a`.
regmatches(a,gregexpr('[.](?!\\d+$)|,',df[,3],perl = T)) = list(c("","","","."))
a
# Fixed version of the original one-liner, which referenced an undefined
# variable `si` where the source column was meant. Logic: rows containing a
# decimal comma first have their thousands dots removed; then every comma
# becomes the decimal point; finally the result is coerced to numeric.
# grepl()/gsub() are vectorized, so the original sapply() wrapper is
# unnecessary; fixed = TRUE treats "." and "," as literal characters.
vals <- as.character(df$receita)
df$num <- as.numeric(gsub(",", ".",
                          ifelse(grepl(",", vals, fixed = TRUE),
                                 gsub(".", "", vals, fixed = TRUE),
                                 vals),
                          fixed = TRUE))
should do the trick.
First, the function searches for rows with ",", removes "." in these rows, and last it converts all occurring "," into ".", so that it can be converted without problems to numeric.
Use print(df$num, digits = 12) to see your data with 2 decimals.
I have 2 dataframes df1 and df2.
df1 contains 2 columns - t1 and data1, with t1 starting from 0.0001 till 75, with an increment of 0.0001. So it goes like 0.0001, 0.0002, 0.0003..... 74.9999, 75.0000. data1 is just some numbers between 0 and 1.
df2 also contains 2 columns - t2 and data2, but the length of each column is 114 - only selected values between 0.0001 and 75 are present in the time column - eg. 14.6000,15.2451,....73.4568. data2 is again some random numbers with length of 114
I have extracted the values of t2 from another data set
# t2: the 114 time points extracted from another data set.
# NOTE(review): 14.6002 appears twice, so t2 holds only 113 unique values.
t2<- c(14.6000, 14.6001, 14.6002, 14.6002, 14.6007, 14.6011, 14.6016, 14.602, 14.6037, 14.6055, 14.6072, 14.6089, 14.6151, 14.6214, 14.6277, 14.6339, 14.6402, 14.6545, 14.6688, 14.6831, 14.6974, 14.7117, 14.7261, 14.7573, 14.7886, 14.8199, 14.8511, 14.8824, 14.9137, 14.9681, 15.0225, 15.0768, 15.1312, 15.1856, 15.24, 15.3233, 15.4065, 15.4897, 15.573, 15.6562, 15.7394, 15.8768, 16.0142, 16.1516, 16.289, 16.4264, 16.5638, 16.7676, 16.9715, 17.1753, 17.3792, 17.583, 17.7868, 17.9907, 18.3366, 18.6826, 19.0285, 19.3745, 19.7204, 20.0664, 20.4124, 20.9122, 21.412, 21.9118, 22.4116, 22.9114, 23.4112, 23.911, 24.5965, 25.282, 25.9675, 26.653, 27.3385, 28.024, 29.1158, 30.2075, 31.2993, 32.3911, 33.4828, 34.6828, 35.8828, 37.0828, 38.2828, 39.4828, 40.6828, 41.8828, 43.0828, 44.2828, 45.4828, 46.6828, 47.8828, 49.0828, 50.2828, 51.4828, 52.6828, 53.8828, 55.0828, 56.2828, 57.4828, 58.6828, 59.8828, 61.0828, 62.2828, 63.4828, 64.6828, 65.8828, 67.0828, 68.2828, 69.4828, 70.6828, 71.8828, 73.0828, 74.2828,74.6000)
# df1: the full grid — t1 runs 0.0001 .. 75 in steps of 0.0001 (750,000 rows)
# with random data1 values.
df1<- data.frame("t1"=seq(0.0001,75,0.0001), "data1"=c(rnorm(750000)))
# df2: one random data2 value per t2 time point.
df2<- data.frame("t2"=t2, "data2"=c(rnorm(length(t2))))
I want to create a new dataframe - df_new , in which I want to pick the values of t2 and the corresponding data1 values from df1
# %in% compares doubles bit-for-bit, and seq()-generated values can differ
# from the typed literals in the last binary digits, so a direct
# subset(df1, t1 %in% df2$t2) silently misses rows. Rounding both sides to
# the 4 decimal places the data actually carries makes the membership test
# reliable (expect 113 rows, since t2 holds 113 unique values).
df_new<- subset(df1, round(t1, 4) %in% round(df2$t2, 4))
When I do this, df_new has only 74 observations, instead of 114. Am I doing something wrong here?
This seems to be a problem with floating point arithmetic. See two examples below. In general, directly comparing floats like this is not necessarily going to be robust because the accuracies of the representation isn't perfect. I picked the first element in df2$t2 that doesn't line up as expected. You would hope that the first == comparison would return true but it doesn't. See that all.equal, which confusingly tests "near equality", does in fact return true for the two objects I pulled out. You can see that there is a difference by changing the digits printed with options.
One way to get the intended result is to use round to make all the numbers you want the same. Note that there are only 113 rows in your output because there are only 113 unique values in df2$t2 as provided. You might also consider converting to integers (with correspondingly smaller units).
# Reproducible demonstration that the mismatch is floating-point
# representation, not logic.
t2<- c(14.6000, 14.6001, 14.6002, 14.6002, 14.6007, 14.6011, 14.6016, 14.602, 14.6037, 14.6055, 14.6072, 14.6089, 14.6151, 14.6214, 14.6277, 14.6339, 14.6402, 14.6545, 14.6688, 14.6831, 14.6974, 14.7117, 14.7261, 14.7573, 14.7886, 14.8199, 14.8511, 14.8824, 14.9137, 14.9681, 15.0225, 15.0768, 15.1312, 15.1856, 15.24, 15.3233, 15.4065, 15.4897, 15.573, 15.6562, 15.7394, 15.8768, 16.0142, 16.1516, 16.289, 16.4264, 16.5638, 16.7676, 16.9715, 17.1753, 17.3792, 17.583, 17.7868, 17.9907, 18.3366, 18.6826, 19.0285, 19.3745, 19.7204, 20.0664, 20.4124, 20.9122, 21.412, 21.9118, 22.4116, 22.9114, 23.4112, 23.911, 24.5965, 25.282, 25.9675, 26.653, 27.3385, 28.024, 29.1158, 30.2075, 31.2993, 32.3911, 33.4828, 34.6828, 35.8828, 37.0828, 38.2828, 39.4828, 40.6828, 41.8828, 43.0828, 44.2828, 45.4828, 46.6828, 47.8828, 49.0828, 50.2828, 51.4828, 52.6828, 53.8828, 55.0828, 56.2828, 57.4828, 58.6828, 59.8828, 61.0828, 62.2828, 63.4828, 64.6828, 65.8828, 67.0828, 68.2828, 69.4828, 70.6828, 71.8828, 73.0828, 74.2828,74.6000)
set.seed(12345)
df1<- data.frame("t1"=seq(0.0001,75,0.0001), "data1"=c(rnorm(750000)))
df2<- data.frame("t2"= t2, "data2"=c(rnorm(length(t2))))
# At the default 7 printed digits the two values look identical...
df2$t2[2]
#> [1] 14.6001
df1$t1[146001]
#> [1] 14.6001
# ...but bitwise they are not equal, while all.equal() — which tests "near
# equality" within a tolerance — says they are.
df1$t1[146001] == df2$t2[2]
#> [1] FALSE
all.equal(df1$t1[146001], df2$t2[2])
#> [1] TRUE
# Printing more digits exposes the difference in the last decimal places.
options(digits = 22)
df2$t2[2]
#> [1] 14.600099999999999
df1$t1[146001]
#> [1] 14.600100000000001
# Fix 1: round both sides to the 4 decimals the data carries.
df_new_rnd <- subset(df1, round(t1, 4) %in% round(df2$t2, 4))
# Fix 2: scale to integer units of 0.0001 before comparing.
df_new_int <- subset(df1, as.integer(t1 * 10000) %in% as.integer(df2$t2 * 10000))
# 113 rows, not 114: t2 contains the value 14.6002 twice, so only 113 unique
# time points exist.
nrow(df_new_rnd)
#> [1] 113
nrow(df_new_int)
#> [1] 113
Created on 2018-05-22 by the reprex package (v0.2.0).