is there a formula to transpose one cell based on another - formula

If T2= "5"
then
A2(on a different sheet)= N2,
B2(on different sheet)= N2,
C2(on different sheet)= N2,
D2(on different sheet)= N2,
E2(on different sheet)= N2
then
if T3= "2"
F2(on different sheet)=N2
G2(on different sheet)=N2

There are 3 formulas for this:
Only T2 = 5 and/or T3 = 2 are recognised: five, two, seven, or zero columns will be N2
={if(MainSheet!T2=5,{MainSheet!N2,MainSheet!N2,MainSheet!N2,MainSheet!N2,MainSheet!N2},{"","","","",""}),if(MainSheet!T3=2,{MainSheet!N2,MainSheet!N2},{"",""})}
T2+T3 columns will be N2
=split(rept(MainSheet!N2 & ",",MainSheet!T2+MainSheet!T3),",",true,true)
T2 in 5 columns will be N2 and T3 after first 5 columns will be N2
=split({rept(MainSheet!N2 & ",",MainSheet!T2) & rept(",",5-MainSheet!T2) & rept(MainSheet!N2 & ",",MainSheet!T3) & rept(",",2-MainSheet!T3)},",",true,false)

Related

Compare two columns in different tables

Say I have two tables, Table A and Table B, and I want to compare a certain column.
For example,
Table A has the columns:
Name,Surname ,Family, species
Table B has the columns:
IP,Genes,Types,Species,Models
How do I compare the Species column between the two tables to get the matches , that means that i want to extract name of species that exist in both tables?
For example, if the first Species column has
a b c d e f g h i
and the second Species column has
k l m n a b y i l
i want this result :
a b i
Can you please tell me how I can do that, and also whether there is any way to do it without using a join?
Thank you very much
Try any of these options. I have used dummy data:
# Data: two dummy tables that share a Species column.
# stringsAsFactors is spelled out as FALSE (not the reassignable shorthand F)
# so Species is always a character vector.
TableA <- data.frame(
  Species = c("a", "b", "c", "d", "e", "f", "g", "h", "i"),
  Var = 1,
  stringsAsFactors = FALSE
)
TableB <- data.frame(
  Species = c("k", "l", "m", "n", "a", "b", "y", "i", "l"),
  Var2 = 2,
  stringsAsFactors = FALSE
)
# Option 1: keep the TableA species that also occur in TableB
# (TableA order is preserved; repeated TableA values would be kept).
TableA$Species[TableA$Species %in% TableB$Species]
# Option 2: set intersection of the two columns (duplicates are dropped).
intersect(TableA$Species, TableB$Species)
In both cases the output will be:
[1] "a" "b" "i"

How to count the number of ordered sequence in a vector in R

I have the following data set as an example.
# Example sequence of 20 letters; position in the vector is the ordering.
Data <- c(
  "a", "d", "c", "b", "d", "a", "d", "b", "c", "b",
  "a", "b", "d", "c", "b", "a", "c", "d", "d", "c"
)
I need to find out how many times a < b < c < d, all possible combinations.
I have created a loop (using 4 IF functions) but it takes long time when we have large data set. With this loop the answer is 47 times.
Is there any efficient way to do so in R.
Here is my attempt, but it is very slow when say we have more than 4 letters and large data set.
Data <- c(
  "a", "d", "c", "b", "d", "a", "d", "b", "c", "b",
  "a", "b", "d", "c", "b", "a", "c", "d", "d", "c"
)
set.seed(123)
# Sorted random values: Data0 increases with position, so comparing Data0
# values is the same as comparing positions within Data.
Data0 <- sort(sample(1:100, 20))
df <- data.frame(Data, Data0)
# Values belonging to each letter (column 2 is Data0).
A <- df[Data == "a", 2]
B <- df[Data == "b", 2]
C <- df[Data == "c", 2]
D <- df[Data == "d", 2]

# Count the (a, b, c, d) combinations, one value taken from each of A, B, C
# and D, that satisfy a < b < c < d.
#
# A, B, C, D: numeric vectors of candidate values for each position.
# Returns a single number: the count of strictly increasing combinations.
# An empty input vector yields 0, because seq_along() then skips the loop
# (1:length(x) would iterate over c(1, 0) and turn the result into NA).
myfun <- function(A, B, C, D) {
  f0 <- 0
  for (i in seq_along(A)) {
    for (j in seq_along(B)) {
      for (k in seq_along(C)) {
        for (l in seq_along(D)) {
          # Product of the three comparisons is 1 only when all of them hold.
          f0 <- f0 + (A[i] < B[j]) * (B[j] < C[k]) * (C[k] < D[l])
        }
      }
    }
  }
  f0
}
myfun(A, B, C, D)
Many Thanks
You could use split to divide Data0 based on groups i.e Data, create all possible combinations using expand.grid and count number of times a < b < c < d.
# Split the values by letter and build every combination with one value per
# letter; expand.grid() accepts the named list produced by split() directly.
temp <- expand.grid(split(df$Data0, df$Data))
# Count the combinations that are strictly increasing from a through d.
sum(temp$a < temp$b & temp$b < temp$c & temp$c < temp$d)
#[1] 47
If there are many columns and we do not want to check it manually, we can use apply and for every row check the difference between consecutive elements with diff and count number of occurrences when all the values are higher than the previous values.
# A row counts when none of its consecutive differences is zero or negative,
# i.e. its values increase strictly from the first column to the last.
sum(apply(temp, MARGIN = 1, FUN = function(row_vals) {
  !any(diff(row_vals) <= 0)
}))
#[1] 47

Replace blank values in one column based on partial string values of another in R

I have data that looks like the following:
# Example input: column a is blank except in the last row, b is always filled.
pig <- data.frame(
  a = c(rep("", 5), "Type1"),
  b = c("T1 NR", "T2 NR", "T2", "T3", "T3", "Type1")
)
print(pig)
a b
T1 NR
T2 NR
T2
T3
T3
Type1 Type1
Where a will sometimes be blank, but b always has information that corresponds to the missing information in a. I am looking for two things. First, where "NR" is present in b, I want "NR" to appear in a. Second, where "NR" is not present in b and a is blank, I would like some other string, such as "DKT", to appear in a, to get something like the following:
# Desired result: blanks in a replaced by "NR" or "DKT" depending on b.
cow <- data.frame(
  a = c(rep("NR", 2), rep("DKT", 3), "Type1"),
  b = c("T1 NR", "T2 NR", "T2", "T3", "T3", "Type1")
)
print(cow)
a b
NR T1 NR
NR T2 NR
DKT T2
DKT T3
DKT T3
Type1 Type1
Thanks!
You can assign using row/column subsetting with a data.frame.
# Keep both columns as plain character vectors: the fix below edits strings,
# and factors would get in the way.
pig <- data.frame(
  a = c(rep("", 5), "Type1"),
  b = c("T1 NR", "T2 NR", "T2", "T3", "T3", "Type1"),
  stringsAsFactors = FALSE
)
# The actual solution
# Flag the rows whose b value contains "NR" and label them "NR" in a.
has_nr <- grepl("NR", pig$b)
pig[has_nr, "a"] <- "NR"
# Every other row whose a is still blank gets the placeholder "DKT".
pig[!(has_nr | pig$a != ""), "a"] <- "DKT"

How to optimize and faster for loop when handling large dataset in R?

Currently, I'm working on the data transformation. The data is not super large, about 190k rows.
I wrote a for loop like this:
# For every row of df2, look up the row(s) of df that match one of six
# coordinate pairings (a-f) and copy order_cnt into columns 18-23 of df2.
# Every lookup also requires df[,4] == df2[i,4] and df[,3] == df2[i,5].
# seq_len() is used so that an empty df2 runs zero iterations; 1:nrow(df2)
# would iterate over c(1, 0) in that case.
# NOTE(review): ifelse() with a length-one test returns a single value, so
# when several df rows match only the first order_cnt is stored.
for (i in seq_len(nrow(df2))) {
  #a
  record.a <- df[which(df$first_lat==df2[i,"third_lat"]
                       & df$first_lon==df2[i,"third_lon"]
                       & df$sixth_lat==df2[i,"fourth_lat"]
                       & df$sixth_lon==df2[i,"fourth_lon"]
                       & df[,4]==df2[i,4]
                       & df[,3]==df2[i,5]),]
  df2[i,18] <- ifelse(nrow(record.a) != 0,record.a$order_cnt,NA)
  #b
  record.b <- df[which(df$fifth_lat==df2[i,"third_lat"]
                       & df$fifth_lon==df2[i,"third_lon"]
                       & df$sixth_lat==df2[i,"second_lat"]
                       & df$sixth_lon==df2[i,"second_lon"]
                       & df[,4]==df2[i,4]
                       & df[,3]==df2[i,5]),]
  df2[i,19] <- ifelse(nrow(record.b) != 0,record.b$order_cnt,NA)
  #c
  record.c <- df[which(df$fifth_lat==df2[i,"first_lat"]
                       & df$fifth_lon==df2[i,"first_lon"]
                       & df$fourth_lat==df2[i,"second_lat"]
                       & df$fourth_lon==df2[i,"second_lon"]
                       & df[,4]==df2[i,4]
                       & df[,3]==df2[i,5]),]
  df2[i,20] <- ifelse(nrow(record.c) != 0,record.c$order_cnt,NA)
  #d
  record.d <- df[which(df$third_lat==df2[i,"first_lat"]
                       & df$third_lon==df2[i,"first_lon"]
                       & df$fourth_lat==df2[i,"sixth_lat"]
                       & df$fourth_lon==df2[i,"sixth_lon"]
                       & df[,4]==df2[i,4]
                       & df[,3]==df2[i,5]),]
  df2[i,21] <- ifelse(nrow(record.d) != 0,record.d$order_cnt,NA)
  #e
  record.e <- df[which(df$third_lat==df2[i,"fifth_lat"]
                       & df$third_lon==df2[i,"fifth_lon"]
                       & df$second_lat==df2[i,"sixth_lat"]
                       & df$second_lon==df2[i,"sixth_lon"]
                       & df[,4]==df2[i,4]
                       & df[,3]==df2[i,5]),]
  df2[i,22] <- ifelse(nrow(record.e) != 0,record.e$order_cnt,NA)
  #f
  record.f <- df[which(df$first_lat==df2[i,"fifth_lat"]
                       & df$first_lon==df2[i,"fifth_lon"]
                       & df$second_lat==df2[i,"fourth_lat"]
                       & df$second_lon==df2[i,"fourth_lon"]
                       & df[,4]==df2[i,4]
                       & df[,3]==df2[i,5]),]
  df2[i,23] <- ifelse(nrow(record.f) != 0,record.f$order_cnt,NA)
}
So, basically, I need to fill out 6 columns of df2 respectively from df with 6 criteria. In the for loop, nrow(df2) is about 190k. It runs super slow. But I used view(df2) to check it and it runs fine. So is there any method I could make it faster? I may apply the same data transformation to a much larger dataset in the future.
df:
df
df2:
df2
The data is about grids on a map. df2 is basically a subset of df but add 6 additional columns. Both df and df2 has the same lon and lat information.
Each grid_id stands for a hexagon area in a map. Each hexagon is connected to other six hexagons by two pairs of lon and lat. What I want to do is that find a particular values from the six surrounding hexagons (in df) to fill into columns (a, b, c, d, e, f) in df2. Also, I need two other conditions, which is hours, ten_mins_interval. (df[,4]==df2[i,4] & df[,3]==df2[i,5]))
So I think the logic is:
For each grid_id, hours, ten_mins_interval (1 row) in df2
find the corresponding 6 grid_ids (6 rows) with same hours, ten_mins_interval in df
fill order_cnt from those 6 rows into a,b,c,d,e,f columns in df2
If you start with your current df2[,1:17] you can add df[,18] with the merge command:
# Vectorised version of lookup "a" from the loop: join df (x) onto df2 (y),
# keeping every df2 row (all.y = TRUE) and bringing in df's order_cnt.
# Key pairing mirrors the loop conditions: df's first/sixth coordinates
# against df2's third/fourth coordinates, df column 4 against df2 column 4,
# and df column 3 against df2 column 5.
# Placeholders: "col4name" and "col3name" in the df selection and in by.x are
# the names of df's 4th and 3rd columns; "col4name" and "col5name" in by.y
# are the names of df2's 4th and 5th columns.
# NOTE(review): untested, like the original answer; verify the resulting
# column names before building the other five merges.
df2 <- merge(
  df[, c("first_lat", "first_lon", "sixth_lat", "sixth_lon",
         "col4name", "col3name", "order_cnt")],
  df2,
  by.x = c("first_lat", "first_lon", "sixth_lat", "sixth_lon",
           "col4name", "col3name"),
  by.y = c("third_lat", "third_lon", "fourth_lat", "fourth_lon",
           "col4name", "col5name"),
  all.y = TRUE
)
You'll need to replace col4name with the name of the fourth column and so on - I can't see from the screenshot what that might be. Also check that the name of the count column matches your data (the loop in the question uses order_cnt). Five more versions of this command can easily be generated to add the other five columns. As the operation works on whole vectors at a time, it's likely to be faster than looping. As the data wasn't provided in a suitable format, this isn't tested.

Selecting random row from a data.frame and assigning it to one of the two other data.frames based on three conditions in R

I have a data.frame (a) as mentioned below:
V1 V2
1 a b
2 a e
3 a f
4 b c
5 b e
6 b f
7 c d
8 c g
9 c h
10 d g
11 d h
12 e f
13 f g
14 g h
Lets assume each row represents an edge of a graph and the values of the rows are vertices.
What I want is to pick a random row (which is an edge) from data.frame (a) and assign it to data.frame (b) or data.frame (c) based on the three conditions below. Just to clarify that data.frame (b and c) are empty in the beginning. So the conditions are:
When a row(edge) is randomly picked from data.frame (a) and if neither vertex has been assigned, then assign the edge to the data.frame with least number of rows.
To clarify this condition:
Lets say I pick a random row(edge)#2 from data.frame (a) which has two vertices "a" and "e". So I should check if data.frame (b) and data.frame (c) have either "a" or "e" present in any of their rows. So if they have "a" or "e" present then this rule should not be implemented and next rule should be checked. If both data.frames do not have "a" or "e" present in any of the rows then nrow(number of rows) should be checked in both data.frames and the one with lower number of nrow() should be assigned that row. If both have same nrow() then any of the two data.frame could be assigned that row.
When a row(edge) is randomly picked from data.frame (a) and if one of the vertices of that row is present in any of the data.frames (b) or (c) then assign the row(edge) to that data.frame
If a random row is picked say for example #3 which has "a" and "f". Then data.frames b and c should be checked to see if any of the rows contain either "a" or "f". Suppose data.frame (b) does not contain either "a" or "f" but data.frame (c) contains "f". So the row should be assigned to data.frame (c).
Now there is also a possibility that data.frame (b) contains "a" and data.frame(c) contains "f". In that case, all the instances of "a" in data.frame (b) and "f" in data.frame (c) should be counted. If "a" appears 3 times and "f" appears 4 times then the row should be assigned to (b) i.e The row then should be assigned to the data.frame which has lower number of instances of the vertex present in that data.frame.
When a row(edge) is randomly picked from data.frame (a) and if both the vertices of that row are present in a data.frame then assign the row to that data.frame
So to summarize, a random row should be picked from data.frame(a) and check for the above mentioned conditions and should be assigned to data.frame(b) or (c) after going through the conditions above. So all the rows of data.frame(a) have to be checked for the conditions.
This should get you started. You can't continually randomly select rows, as you discovered, as that leads to duplicates. Instead, randomly assign the rows to a vector which gives the order they should be processed in. If you don't think this is the right approach, you could also randomly select a row, then remove it from a and later randomly select from what remains. If you still need a, remove the row from a copy of a.
# Reproducible toy edge list: each row is an edge between two vertices drawn
# (with replacement) from the letters a-i.
set.seed(1)
dfa <- data.frame(
  V1 = sample(letters[1:9], replace = TRUE),
  V2 = sample(letters[1:9], replace = TRUE)
)
# Random processing order: a permutation of dfa's row indices, so that no
# row is ever picked twice.
todo <- sample(seq_len(nrow(dfa)))
# Seed the two target data frames with the first two rows in that order.
dfb <- dfa[todo[1], ]
dfc <- dfa[todo[2], ]
Now continue through 'todo' in order, applying your conditions
and using rbind to add rows to the dfb and dfc:
# Walk the rest of the processing order; positions 1 and 2 of todo were
# already used to seed dfb and dfc. Dropping the first two indices from
# seq_along(todo) gives an empty loop when todo has fewer than three
# elements, whereas 3:length(todo) would count downwards.
for (i in seq_along(todo)[-(1:2)]) {
  # apply your logic
  # if a row belongs in dfb, do
  dfb <- rbind(dfb, dfa[todo[i], ])
  # etc
}
# Count how often `vertex` occurs in either endpoint column (V1, V2) of
# `edges`. NA comparisons are ignored, so an empty table or an NA vertex
# gives 0.
count_vertex <- function(edges, vertex) {
  sum(edges$V1 == vertex, na.rm = TRUE) + sum(edges$V2 == vertex, na.rm = TRUE)
}

aCopy <- read.table("isnodes.txt")
# Zero-row data frames with the same columns as aCopy: the two partitions
# and a log of the rows in the order they were drawn.
p1 <- aCopy[0, ]
p2 <- aCopy[0, ]
currentRowHistory <- aCopy[0, ]
# NOTE(review): `a` is not defined in this snippet; presumably it is the
# original edge table that aCopy duplicates - confirm before running.
for (i in seq_len(nrow(a))) {
  # Draw one remaining edge at random, log it, then drop every row of aCopy
  # with the same endpoints so it cannot be drawn again.
  currentRow <- aCopy[sample(nrow(aCopy), 1), ]
  currentRowHistory <- rbind(currentRow, currentRowHistory)
  currentRowV1 <- as.character(currentRow$V1[1])
  currentRowV2 <- as.character(currentRow$V2[1])
  aCopy <- aCopy[!(aCopy$V1 == currentRowV1 & aCopy$V2 == currentRowV2), ]

  # Occurrences of each endpoint in each partition.
  v1_in_p1 <- count_vertex(p1, currentRowV1)
  v2_in_p1 <- count_vertex(p1, currentRowV2)
  v1_in_p2 <- count_vertex(p2, currentRowV1)
  v2_in_p2 <- count_vertex(p2, currentRowV2)

  if (v1_in_p1 > 0) {
    if (v2_in_p1 > 0) {
      # Both endpoints already in p1.
      p1 <- rbind(currentRow, p1)
      result <- "case 1 assign it to p1"
    } else if (v2_in_p2 > 0) {
      # Endpoints split across partitions: pick the side whose endpoint
      # occurs fewer times (ties go to p2). Plain if/else replaces the
      # scalar ifelse() that was used only for its assignment side effect.
      if (v1_in_p1 < v2_in_p2) {
        p1 <- rbind(currentRow, p1)
      } else {
        p2 <- rbind(currentRow, p2)
      }
      result <- "case 2"
    } else {
      # Only the first endpoint is known, and it is in p1.
      p1 <- rbind(currentRow, p1)
      result <- "case 3 assign it to p1"
    }
  } else if (v1_in_p2 > 0) {
    if (v2_in_p2 > 0) {
      # Both endpoints already in p2.
      p2 <- rbind(currentRow, p2)
      result <- "case 1 assign it to p2"
    } else if (v2_in_p1 > 0) {
      # Endpoints split across partitions (ties go to p1 here).
      if (v1_in_p2 < v2_in_p1) {
        p2 <- rbind(currentRow, p2)
      } else {
        p1 <- rbind(currentRow, p1)
      }
      result <- "case 2"
    } else {
      # Only the first endpoint is known, and it is in p2.
      p2 <- rbind(currentRow, p2)
      result <- "case 3 assign it to p2"
    }
  } else if (v2_in_p1 > 0) {
    # Only the second endpoint is known, and it is in p1.
    p1 <- rbind(currentRow, p1)
    result <- "Assign it to p1 case 3"
  } else if (v2_in_p2 > 0) {
    # Only the second endpoint is known, and it is in p2.
    p2 <- rbind(currentRow, p2)
    result <- "Assign it to p2 case 3"
  } else {
    # Neither endpoint seen yet: give the edge to the smaller partition
    # (ties go to p2).
    if (nrow(p1) < nrow(p2)) {
      p1 <- rbind(currentRow, p1)
    } else {
      p2 <- rbind(currentRow, p2)
    }
  }
}

Resources