I have a dataframe (df) like this:
Rif dd A A A A A B B B B B C C C C C
a1 10 5 8 10 2 6 9 6 5 7 9 1 5 6 4 5
b1 20 12 7 1 5 9 10 5 3 8 7 3 6 1 9 8
c1 100 11 6 8 1 14 1 11 9 3 6 10 8 13 8 4
d1 70 4 3 7 8 11 19 2 6 7 1 20 18 7 10 7
I have a vector
rif <- c(0, 15, 50, 90, 110)
I would like to add to the df a column such that if dd(i) >= rif(i-1) & dd(i)
Rif dd A A A A A B B B B B C C C C C V1
a1 10 5 8 10 2 6 9 6 5 7 9 1 5 6 4 5 8
b1 20 12 7 1 5 9 10 5 3 8 7 3 6 1 9 8 1
c1 100 1 6 8 1 14 1 11 9 3 6 10 8 13 8 4 14
d1 70 4 3 7 8 11 19 2 6 7 1 20 18 7 10 7 8
The same should be done for V2 and V3 with respect to Bs and Cs columns.
ref <- c(0, 15, 50, 90, 110)
for (i in 2:length(ref)) {
for (j in 1:nrow(df)) {
if (df$dd >= ref[i-1] && df$dd< ref[i]) {
df[,"V1"] <- df[j,i]
}
}
}
I get the following error:
Error in if (..) :
missing value where TRUE/FALSE needed
Probably the if command is not the correct one.
could you help me?
I think you just need to better specify the rows and columns:
df <- data.frame(
c("a1","b1","c1","d1")
, c(10,20,100,70), c(5,12,11,4), c(8,7,6,3), c(10,1,8,7), c(2,5,1,8), c(6,9,14,11)
, c(9,10,1,19), c(6,5,11,2), c(5,3,9,6), c(7,8,3,7), c(9,7,6,1)
, c(1,3,10,20), c(5,6,8,18), c(6,1,13,7), c(4,9,8,10), c(5,8,4,7)
)
colnames(df) <- c("Rif", "dd", "A", "A", "A", "A", "A", "B", "B", "B", "B", "B", "C", "C", "C", "C", "C")
ref <- c(0, 15, 50, 90, 110)
for (i in 2:length(ref)) {
for (j in 1:nrow(df)) {
if (df$dd[j] >= ref[i-1] && df$dd[j] < ref[i]) {
df$V1[j] <- df[j,i+2]
df$V2[j] <- df[j,i+2+5]
df$V3[j] <- df[j,i+2+10]
}
}
}
which gives:
Rif dd A A A A A B B B B B C C C C C V1 V2 V3
1 a1 10 5 8 10 2 6 9 6 5 7 9 1 5 6 4 5 8 6 5
2 b1 20 12 7 1 5 9 10 5 3 8 7 3 6 1 9 8 1 3 1
3 c1 100 11 6 8 1 14 1 11 9 3 6 10 8 13 8 4 14 6 4
4 d1 70 4 3 7 8 11 19 2 6 7 1 20 18 7 10 7 8 7 10
Another option in base R:
lters <- c(A="A", B="B", C="C")
firstcol <- lapply(lters, function(x) match(x, colnames(DF)))
idx <- findInterval(DF$dd, rif)
for (l in lters)
DF[, paste0("V_", l)] <- as.integer(DF[cbind(seq_len(nrow(DF)), idx + firstcol[[l]])])
DF
output:
Rif dd A A.1 A.2 A.3 A.4 B B.1 B.2 B.3 B.4 C C.1 C.2 C.3 C.4 V_A V_B V_C
1 a1 10 5 8 10 2 6 9 6 5 7 9 1 5 6 4 5 8 6 5
2 b1 20 12 7 1 5 9 10 5 3 8 7 3 6 1 9 8 1 3 1
3 c1 100 11 6 8 1 14 1 11 9 3 6 10 8 13 8 4 14 6 4
4 d1 70 4 3 7 8 11 19 2 6 7 1 20 18 7 10 7 8 7 10
data:
DF <- structure(list(Rif = c("a1", "b1", "c1", "d1"), dd = c(10L, 20L,
100L, 70L), A = c(5L, 12L, 11L, 4L), A = c(8L, 7L, 6L, 3L), A = c(10L,
1L, 8L, 7L), A = c(2L, 5L, 1L, 8L), A = c(6L, 9L, 14L, 11L),
B = c(9L, 10L, 1L, 19L), B = c(6L, 5L, 11L, 2L), B = c(5L,
3L, 9L, 6L), B = c(7L, 8L, 3L, 7L), B = c(9L, 7L, 6L, 1L),
C = c(1L, 3L, 10L, 20L), C = c(5L, 6L, 8L, 18L), C = c(6L,
1L, 13L, 7L), C = c(4L, 9L, 8L, 10L), C = c(5L, 8L, 4L, 7L
)), class = "data.frame", row.names = c(NA, -4L))
rif <- c(0, 15, 50, 90, 110)
Another way is reorganize the data by separating the lookup values into another table and perform an update join using data.table:
library(data.table)
setDT(DF)
out <- DF[, .(rn=.I, Rif, dd)]
#reorganizing data
lc <- grepl("A|B|C", names(DF))
lutbl <- data.table(COL=names(DF)[lc], transpose(DF[, ..lc]))
lutbl <- melt(lutbl, measure.vars=patterns("V"), variable.name="rn")[,
c("rn", "rif") := .(as.integer(gsub("V", "", rn)), rep(rif, sum(lc)*nrow(DF)/length(rif)))]
#lookup and update
for (l in lters)
out[, paste0("NEW", l) := lutbl[COL==l][out, on=c("rn", "rif"="dd"), roll=-Inf, value]]
out:
rn Rif dd NEWA NEWB NEWC
1: 1 a1 10 8 6 5
2: 2 b1 20 1 3 1
3: 3 c1 100 14 6 4
4: 4 d1 70 8 7 10
Related
The code below is generating the mode value considering the columns Method1, Method2, Method3 and Method4. However, notice that for alternative 10 and 12 it has the same mode value, that is, it has a value of 2. However, I would like my Mode column to have different values, as if it were a rank. Therefore, the alternative that had Mode=1 is the best, but I have no way of knowing the second best alternative, because it has two numbers 2 in the Mode column. Do you have suggestions on what approach I can take?
database<-structure(list(Alternatives = c(3, 4, 5, 6, 7, 8, 9, 10, 11, 12),
Method1 = c(1L, 10L, 7L, 8L, 9L, 6L, 5L, 3L, 4L, 2L), Method2 = c(1L,
8L, 6L, 7L, 10L, 9L, 4L, 2L, 3L, 5L), Method3 = c(1L,
10L, 7L, 8L, 9L, 6L, 4L, 2L, 3L, 5L), Method4 = c(1L,
9L, 6L, 7L, 10L, 8L, 5L, 3L, 4L, 2L)), class = "data.frame", row.names = c(NA,
10L))
ModeFunc <- function(Vec) {
tmp <- sort(table(Vec),decreasing = TRUE)
Nms <- names(tmp)
if(max(tmp) > 1) {
as.numeric(Nms[1])
} else NA}
output <- database |> rowwise() |>
mutate(Mode = ModeFunc(c_across(Method1:Method4))) %>%
data.frame()
> output
Alternatives Method1 Method2 Method3 Method4 Mode
1 3 1 1 1 1 1
2 4 10 8 10 9 10
3 5 7 6 7 6 6
4 6 8 7 8 7 7
5 7 9 10 9 10 9
6 8 6 9 6 8 6
7 9 5 4 4 5 4
8 10 3 2 2 3 2
9 11 4 3 3 4 3
10 12 2 5 5 2 2
CHECK
output$Rank <- (nrow(output) + 1) - rank(-output$Mode, ties.method = "last")
output|>
arrange(Mode)
Alternatives Method1 Method2 Method3 Method4 Mode Rank
1 3 1 1 1 1 1 1
2 10 3 2 2 3 2 2
3 12 2 5 5 2 2 3
4 11 4 3 3 4 3 4
5 9 5 4 4 5 4 5
6 5 7 6 7 6 6 6
7 8 6 9 6 8 6 7
8 6 8 7 8 7 7 8
9 7 9 10 9 10 9 9
10 4 10 8 10 9 10 10
OK. Based on OP's comment above, Here's a solution that picks the row with the lowest value of Alternatives in case of ties. You can generalise to any other tie break with an appropriate modification of the second mutate.
output |>
arrange(Mode) |> # Sort by mode
group_by(Mode) |> # Assign intial ranks
mutate(Rank=cur_group_id()) |>
arrange(Rank, Alternatives) |> # Sort and assign tie break
mutate(TieBreak=row_number()) |>
ungroup()
# A tibble: 10 × 8
Alternatives Method1 Method2 Method3 Method4 Mode Rank TieBreak
<dbl> <int> <int> <int> <int> <dbl> <int> <int>
1 3 1 1 1 1 1 1 1
2 10 3 2 2 3 2 2 1
3 12 2 5 5 2 2 2 2
4 11 4 3 3 4 3 3 1
5 9 5 4 4 5 4 4 1
6 5 7 6 7 6 6 5 1
7 8 6 9 6 8 6 5 2
8 6 8 7 8 7 7 6 1
9 7 9 10 9 10 9 7 1
10 4 10 8 10 9 10 8 1
Note that cur_group_id() required dplyr v1.0.0 or later and that row_number() takes account of groups when a data frame is grouped.
The code below generates the mode value from the values obtained by Methods 1, 2, 3 and 4. But notice that in some cases I have correct mode values, for example, alternatives 3 and 4, but incorrect ones, such as in alternative 5, as it has two values of 7 and two values of 6, but the mode value is showing 6. Furthermore, in alternatives 11 and 12, it has no a mode value, because it has different values for both methods. So for these incorrect cases, that is, when I have 2 equal values for the same alternative and when I have no mode value, I would like to consider the value obtained by Method 1 to be the mode value. I inserted below the correct output.
Executable code below:
database<-structure(list(Alternatives = c(3, 4, 5, 6, 7, 8, 9, 10, 11, 12),
Method1 = c(1L, 10L, 7L, 8L, 9L, 6L, 5L, 3L, 4L, 2L), Method2 = c(1L,
8L, 6L, 7L, 10L, 9L, 4L, 2L, 5L, 3L), Method3 = c(1L,
10L, 7L, 8L, 9L, 6L, 4L, 2L, 3L, 5L), Method4 = c(1L,
9L, 6L, 7L, 10L, 8L, 5L, 3L, 2L, 4L)), class = "data.frame", row.names = c(NA,
10L))
ModeFunc <- function(Vec) {
tmp <- sort(table(Vec),decreasing = TRUE)
Nms <- names(tmp)
if(max(tmp) > 1) {
as.numeric(Nms[1])
} else NA}
output <- database |> rowwise() |>
mutate(Mode = ModeFunc(c_across(Method1:Method4))) %>%
data.frame()
> output
Alternatives Method1 Method2 Method3 Method4 Mode
1 3 1 1 1 1 1
2 4 10 8 10 9 10
3 5 7 6 7 6 6
4 6 8 7 8 7 7
5 7 9 10 9 10 9
6 8 6 9 6 8 6
7 9 5 4 4 5 4
8 10 3 2 2 3 2
9 11 4 5 3 2 NA
10 12 2 3 5 4 NA
The correct output would then be:
Alternatives Method1 Method2 Method3 Method4 Mode
3 1 1 1 1 1
4 10 8 10 9 10
5 7 6 7 6 7
6 8 7 8 7 8
7 9 10 9 10 9
8 6 9 6 8 6
9 5 4 4 5 5
10 3 2 2 3 3
11 4 5 3 2 4
12 2 3 5 4 2
You could use some conventional mode() function,
mode <- function(x) {
ux <- unique(x)
tb <- tabulate(match(x, ux))
ux[tb == max(tb)]
}
and update values using ifelse in mapply.
mds <- apply(database[-1], 1, mode) |> setNames(database$Alternatives)
mapply(\(x, y) ifelse(length(x) > 1, y, x), mds, database$Method1)
# 3 4 5 6 7 8 9 10 11 12
# 1 10 7 8 9 6 5 3 4 2
So, altogether it could look like this:
database |>
cbind(Mode=mapply(\(x, y) ifelse(length(x) > 1, y, x),
apply(database[-1], 1, mode),
database$Method1))
# Alternatives Method1 Method2 Method3 Method4 Mode
# 1 3 1 1 1 1 1
# 2 4 10 8 10 9 10
# 3 5 7 6 7 6 7
# 4 6 8 7 8 7 8
# 5 7 9 10 9 10 9
# 6 8 6 9 6 8 6
# 7 9 5 4 4 5 5
# 8 10 3 2 2 3 3
# 9 11 4 5 3 2 4
# 10 12 2 3 5 4 2
I have two data-frames
The first data
col1 col2 col3
A1 4 11 15
A2 2 9 17
A3 3 4 4
B1 10 5 4
B2 6 1 8
C1 12 1 12
C2 2 5 8
D1 4 1 6
D2 2 1 8
The second data
meancol1 meancol2 meancol3
meanA 3 8 12
meanB 8 3 6
meanC 7 3 10
meanD 3 1 7
I want to combine the two data-frames and keep the colnames of the first dataset
so the result i want is :
col1 col2 col3
A1 4 11 15
A2 2 9 17
A3 3 4 4
B1 10 5 4
B2 6 1 8
C1 12 1 12
C2 2 5 8
D1 4 1 6
D2 2 1 8
meanA 3 8 12
meanB 8 3 6
meanC 7 3 10
meanD 3 1 7
I tried : the following function
data_all <- rbind(df1,df2)
but it didn't work
I also tried the function bind_rows from dplyr package but this one create new columns.
Thank you
You could always do:
colnames(df2) <- colnames(df1)
data_all <- rbind(df1, df2)
data_all
A possible solution:
library(tidyverse)
df1 <- read.table(text = " col1 col2 col3
A1 4 11 15
A2 2 9 17
A3 3 4 4
B1 10 5 4
B2 6 1 8
C1 12 1 12
C2 2 5 8
D1 4 1 6
D2 2 1 8
", header=T)
df2 <- read.table(text = "meancol1 meancol2 meancol3
meanA 3 8 12
meanB 8 3 6
meanC 7 3 10
meanD 3 1 7
", header=T)
df2 %>% rename_with(~ str_remove(.x, "mean")) %>%
bind_rows(df1, .)
#> col1 col2 col3
#> A1 4 11 15
#> A2 2 9 17
#> A3 3 4 4
#> B1 10 5 4
#> B2 6 1 8
#> C1 12 1 12
#> C2 2 5 8
#> D1 4 1 6
#> D2 2 1 8
#> meanA 3 8 12
#> meanB 8 3 6
#> meanC 7 3 10
#> meanD 3 1 7
You could do:
rbind(df1, setNames(df2, names(df1)))
col1 col2 col3
A1 4 11 15
A2 2 9 17
A3 3 4 4
B1 10 5 4
B2 6 1 8
C1 12 1 12
C2 2 5 8
D1 4 1 6
D2 2 1 8
meanA 3 8 12
meanB 8 3 6
meanC 7 3 10
meanD 3 1 7
Use mapply:
data.frame(mapply(c, df1, df2))
output
id col1 col2 col3
1 A1 4 11 15
2 A2 2 9 17
3 A3 3 4 4
4 B1 10 5 4
5 B2 6 1 8
6 C1 12 1 12
7 C2 2 5 8
8 D1 4 1 6
9 D2 2 1 8
10 meanA 3 8 12
11 meanB 8 3 6
12 meanC 7 3 10
13 meanD 3 1 7
data
df1 <- read.table(text = "id col1 col2 col3
A1 4 11 15
A2 2 9 17
A3 3 4 4
B1 10 5 4
B2 6 1 8
C1 12 1 12
C2 2 5 8
D1 4 1 6
D2 2 1 8
", header=T)
df2 <- read.table(text = "id meancol1 meancol2 meancol3
meanA 3 8 12
meanB 8 3 6
meanC 7 3 10
meanD 3 1 7
", header=T)
Another option is to use rbindlist:
library(data.table)
rbindlist(lapply(list(df1, df2), setDT, keep.rownames = TRUE))
# Or using `add_rownames` from `dplyr`
# rbindlist(lapply(list(df1, df2), add_rownames), use.names = F)
Output
rn col1 col2 col3
1: A1 4 11 15
2: A2 2 9 17
3: A3 3 4 4
4: B1 10 5 4
5: B2 6 1 8
6: C1 12 1 12
7: C2 2 5 8
8: D1 4 1 6
9: D2 2 1 8
10: meanA 3 8 12
11: meanB 8 3 6
12: meanC 7 3 10
13: meanD 3 1 7
Data
df1 <- structure(list(col1 = c(4L, 2L, 3L, 10L, 6L, 12L, 2L, 4L, 2L),
col2 = c(11L, 9L, 4L, 5L, 1L, 1L, 5L, 1L, 1L), col3 = c(15L,
17L, 4L, 4L, 8L, 12L, 8L, 6L, 8L)), class = "data.frame", row.names = c("A1",
"A2", "A3", "B1", "B2", "C1", "C2", "D1", "D2"))
df2 <- structure(list(meancol1 = c(3L, 8L, 7L, 3L), meancol2 = c(8L,
3L, 3L, 1L), meancol3 = c(12L, 6L, 10L, 7L)), class = "data.frame", row.names = c("meanA",
"meanB", "meanC", "meanD"))
I have data frame like this
class col2 col3 col4 col5 col6
A AA 0 5 4 2 15
B AA 4 10 14 12 25
C AA 19 2 8 5 3
D SS 17 5 5 32 12
E AA 14 2 12 14 55
F II 12 17 1 9 0
G SS 10 37 8 2 17
H II 17 7 5 7 14
I want to remove all columns that have zero values
class col3 col4 col5
A AA 5 4 2
B AA 10 14 12
C AA 2 8 5
D SS 5 5 32
E AA 2 12 14
F II 17 1 9
G SS 37 8 2
H II 7 5 7
So the result I want is just want those columns which do not contain any zeros
Thank you
Based on your description I assume you want to remove rows with zero values, not columns. Here's how you can do it with dplyr:
library(dplyr)
filter(df, across(everything(), ~.!=0))
#> # A tibble: 4 x 6
#> class col2 col3 col4 col5 col6
#> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 AA 4 10 14 12 25
#> 2 AA 19 2 8 5 3
#> 3 AA 14 2 12 14 55
#> 4 SS 10 37 8 2 17
A possible solution:
df[apply(df == 0, 2, sum) == 0]
#> class col3 col4 col5
#> A AA 5 4 2
#> B AA 10 14 12
#> C AA 2 8 5
#> D SS 5 5 32
#> E AA 2 12 14
#> F II 17 1 9
#> G SS 37 8 2
#> H II 7 5 7
With the new dataset:
base R:
In base R we can use Filter and negate any:
Filter(function(x) !any(x %in% 0), df)
class col3 col4 col5
A AA 5 4 2
B AA 10 14 12
C AA 2 8 5
D SS 5 5 32
E AA 2 12 14
F II 17 1 9
G SS 37 8 2
H II 7 5 7
One base R option could be:
df_so[,!sapply(df_so, function(x) any(x == 0))]
# class col3 col4 col5
#A AA 5 4 2
#B AA 10 14 12
#C AA 2 8 5
#D SS 5 5 32
#E AA 2 12 14
#F II 17 1 9
#G SS 37 8 2
#H II 7 5 7
Not my answer, but #user2974951 provided a very fast and straightforward answer as a comment in the Original Post:
df[,colSums(df==0)==0]
Here is another option using a combination of select and where:
library(tidyverse)
df %>%
select(where(~!any(. == 0)))
Output
class col3 col4 col5
A AA 5 4 2
B AA 10 14 12
C AA 2 8 5
D SS 5 5 32
E AA 2 12 14
F II 17 1 9
G SS 37 8 2
H II 7 5 7
Before select_if was deprecated, we could have written it like:
df %>%
select_if( ~ !any(. == 0))
Data Table
Here is a possible data.table solution:
library(data.table)
dt <- as.data.table(df)
dt[, .SD, .SDcols = !names(dt)[(colSums(dt == 0) > 0)]]
Data
df <- structure(list(class = c("AA", "AA", "AA", "SS", "AA", "II",
"SS", "II"), col2 = c(0L, 4L, 19L, 17L, 14L, 12L, 10L, 17L),
col3 = c(5L, 10L, 2L, 5L, 2L, 17L, 37L, 7L), col4 = c(4L,
14L, 8L, 5L, 12L, 1L, 8L, 5L), col5 = c(2L, 12L, 5L, 32L,
14L, 9L, 2L, 7L), col6 = c(15L, 25L, 3L, 12L, 55L, 0L, 17L,
14L)), class = "data.frame", row.names = c("A", "B", "C",
"D", "E", "F", "G", "H"))
I am trying to sum 3 values within a data frame by group.
For example:
Name Data
A 3
A 5
A 9
A 4
A 3
A 5
A 9
A 4
A 3
A 5
A 9
A 4
B 1
B 5
B 1
B 9
Here is what I want:
Name Data
A 17
A 18
A 16
A 12
A 17
A 18
A 17
A 12
A 17
A 18
A N/A
A N/A
B 7
B 15
B N/A
B N/A
I am trying to do this iwth dplyr, is there a better way?
We could use rollsum (from zoo) after grouping by 'Name'
library(dplyr)
library(zoo)
df1 %>%
group_by(Name) %>%
mutate(new = rollsum(Data, 3, fill = NA, align = 'left')) %>%
ungroup
-ouput
# A tibble: 16 x 3
Name Data new
<chr> <int> <int>
1 A 3 17
2 A 5 18
3 A 9 16
4 A 4 12
5 A 3 17
6 A 5 18
7 A 9 16
8 A 4 12
9 A 3 17
10 A 5 18
11 A 9 NA
12 A 4 NA
13 B 1 7
14 B 5 15
15 B 1 NA
16 B 9 NA
Or using frollsum from data.table
library(data.table)
setDT(df1)[, new := frollsum(Data, 3, align = 'left'), by = Name]
df1
Name Data new
1: A 3 17
2: A 5 18
3: A 9 16
4: A 4 12
5: A 3 17
6: A 5 18
7: A 9 16
8: A 4 12
9: A 3 17
10: A 5 18
11: A 9 NA
12: A 4 NA
13: B 1 7
14: B 5 15
15: B 1 NA
16: B 9 NA
data
df1 <- structure(list(Name = c("A", "A", "A", "A", "A", "A", "A", "A",
"A", "A", "A", "A", "B", "B", "B", "B"), Data = c(3L, 5L, 9L,
4L, 3L, 5L, 9L, 4L, 3L, 5L, 9L, 4L, 1L, 5L, 1L, 9L)),
class = "data.frame", row.names = c(NA,
-16L))