R adding columns and data - r

I have a table with two columns A and B. I want to create a new table with two new columns added: X and Y. These two new columns are to contain data from column A, but every second row from column A. Correspondingly for column X, starting from the first value in column A and from the second value in column A for column Y.
So far, I have been doing it in Excel. But now I need it in R best function form so that I can easily reuse that code. I haven't done this in R yet, so I am asking for help.
Example data:
structure(list(A = c(2L, 7L, 5L, 11L, 54L, 12L, 34L, 14L, 10L,
6L), B = c(3L, 5L, 1L, 21L, 67L, 32L, 19L, 24L, 44L, 37L)), class = "data.frame", row.names = c(NA,
-10L))
Sample result:
structure(list(A = c(2L, 7L, 5L, 11L, 54L, 12L, 34L, 14L, 10L,
6L), B = c(3L, 5L, 1L, 21L, 67L, 32L, 19L, 24L, 44L, 37L), X = c(2L,
NA, 5L, NA, 54L, NA, 34L, NA, 10L, NA), Y = c(NA, 7L, NA, 11L,
NA, 12L, NA, 14L, NA, 6L)), class = "data.frame", row.names = c(NA,
-10L))

It is not a super elegant solution, but it works:
exampleDF <- structure(list(A = c(2L, 7L, 5L, 11L, 54L,
12L, 34L, 14L, 10L, 6L),
B = c(3L, 5L, 1L, 21L, 67L,
32L, 19L, 24L, 44L, 37L)),
class = "data.frame", row.names = c(NA, -10L))
index <- seq(from = 1, to = nrow(exampleDF), by = 2)
exampleDF$X <- NA
exampleDF$X[index] <- exampleDF$A[index]
exampleDF$Y <- exampleDF$A
exampleDF$Y[index] <- NA

You could also make use of the row numbers and the modulo operator:
A simple ifelse way:
library(dplyr)
df |>
mutate(X = ifelse(row_number() %% 2 == 1, A, NA),
Y = ifelse(row_number() %% 2 == 0, A, NA))
Or using pivoting:
library(dplyr)
library(tidyr)
df |>
mutate(name = ifelse(row_number() %% 2 == 1, "X", "Y"),
value = A) |>
pivot_wider()
A function using the first approach could look like:
See comment
xy_fun <- function(data, A = A, X = X, Y = Y) {
data |>
mutate({{X}} := ifelse(row_number() %% 2 == 1, {{A}}, NA),
{{Y}} := ifelse(row_number() %% 2 == 0, {{A}}, NA))
}
xy_fun(df, # Your data
A, # The col to take values from
X, # The column name of the first new column
Y # The column name of the second new column
)
Output:
A B X Y
1 2 3 2 NA
2 7 5 NA 7
3 5 1 5 NA
4 11 21 NA 11
5 54 67 54 NA
6 12 32 NA 12
7 34 19 34 NA
8 14 24 NA 14
9 10 44 10 NA
10 6 37 NA 6
Data stored as df:
df <- structure(list(A = c(2L, 7L, 5L, 11L, 54L, 12L, 34L, 14L, 10L, 6L),
B = c(3L, 5L, 1L, 21L, 67L, 32L, 19L, 24L, 44L, 37L)
),
class = "data.frame",
row.names = c(NA, -10L)
)

I like the #harre approach:
Another approach with base R we could ->
Use R's recycling ability (of a shorter-vector to a longer-vector):
df$X <- df$A
df$Y <- df$B
df$X[c(FALSE, TRUE)] <- NA
df$Y[c(TRUE, FALSE)] <- NA
df
A B X Y
1 2 3 2 NA
2 7 5 NA 5
3 5 1 5 NA
4 11 21 NA 21
5 54 67 54 NA
6 12 32 NA 32
7 34 19 34 NA
8 14 24 NA 24
9 10 44 10 NA
10 6 37 NA 37

Related

How to select specific element from nested dataframes

I have a list of nested data frames and I want to extract the observations of the earliest year, my problem is the first year change with the data frames. the year is either 1992 or 2005.
I want to create a list to stock them, I tried with which, but since there is the same year, observations are repeated, and I want them apart
new_df<- which(df[[i]]==1992 | df[[i]]==2005)
I've tried with ifelse() but I have to do an lm operation after, and it doesn't work. And I can't take only the first rows, because the year are repeated
my code looks like this:
df<- list(a<-data.frame(a_1<-(1992:2015),
a_2<-sample(1:24)),
b<-data.frame(b_1<-(1992:2015),
b_2<-sample(1:24)),
c<-data.frame(c_1<-(2005:2015),
c_2<-sample(1:11)),
d<-data.frame(d_1<-(2005:2015),
d_2<-sample(1:11)))
You can define a function to get the data on one data.frame and loop on the list to extract values.
Below I use map from the purrr package but you can also use lapply and for loops
Please do not use <- when assigning values in a function call (here data.frame() ) because it will mess colnames. = is used in function calls for arguments variables and it's okay to use it. You can read this ;)
df<- list(a<-data.frame(a_1 = (1992:2015),
a_2 = sample(1:24)),
b<-data.frame(b_1 = (1992:2015),
b_2 = sample(1:24)),
c<-data.frame(c_1 = (2005:2015),
c_2 = sample(1:11)),
d<-data.frame(d_1 = (2005:2015),
d_2 = sample(1:11)))
extract_miny <- function(df){
miny <- min(df[,1])
res <- df[df[,1] == miny, 2]
names(res) <- miny
return(res)
}
map(df, extract_miny)
If the data is sorted as the example, you can slice() the first row for the information. Notice the use of = rather than <- in creating a nested dataframe.
library(tidyverse)
df <- list(
a = data.frame(a_1 = (1992:2015),
a_2 = sample(1:24)),
b = data.frame(b_1 = (1992:2015),
b_2 = sample(1:24)),
c = data.frame(c_1 = (2005:2015),
c_2 = sample(1:11)),
d = data.frame(d_1 = (2005:2015),
d_2 = sample(1:11))
)
df %>%
imap_dfr( ~ slice(.x, 1) %>%
set_names(c("year", "value")) %>%
mutate(dataframe = .y) %>%
as_tibble())
# A tibble: 4 x 3
year value dataframe
<int> <int> <chr>
1 1992 19 a
2 1992 2 b
3 2005 1 c
4 2005 5 d
You may subset anonymeously.
lapply(df, \(x) setNames(x[x[[1]] == min(x[[1]]), ], c('year', 'value'))) |> do.call(what=rbind)
# year value
# 1 1992 6
# 2 1992 9
# 3 2005 11
# 4 2005 11
Or maybe better by creating a variable from which sample the value stems from.
Map(`[<-`, df, 'sample', value=letters[seq_along(df)]) |>
lapply(\(x) setNames(x[x[[1]] == min(x[[1]]), ], c('year', 'value', 'sample'))) |>
do.call(what=rbind)
# year value sample
# 1 1992 6 a
# 2 1992 9 b
# 3 2005 11 c
# 4 2005 11 d
Data:
df <- list(structure(list(a_1.....1992.2015. = 1992:2015, a_2....sample.1.24. = c(6L,
18L, 23L, 5L, 7L, 14L, 4L, 10L, 19L, 17L, 15L, 1L, 11L, 22L,
13L, 8L, 20L, 16L, 2L, 3L, 24L, 21L, 9L, 12L)), class = "data.frame", row.names = c(NA,
-24L)), structure(list(b_1.....1992.2015. = 1992:2015, b_2....sample.1.24. = c(9L,
24L, 18L, 8L, 16L, 11L, 13L, 23L, 15L, 20L, 19L, 21L, 12L, 22L,
7L, 3L, 6L, 17L, 2L, 5L, 4L, 10L, 1L, 14L)), class = "data.frame", row.names = c(NA,
-24L)), structure(list(c_1.....2005.2015. = 2005:2015, c_2....sample.1.11. = c(11L,
2L, 5L, 10L, 9L, 6L, 1L, 7L, 3L, 8L, 4L)), class = "data.frame", row.names = c(NA,
-11L)), structure(list(d_1.....2005.2015. = 2005:2015, d_2....sample.1.11. = c(11L,
2L, 5L, 1L, 6L, 9L, 3L, 7L, 10L, 4L, 8L)), class = "data.frame", row.names = c(NA,
-11L)))

R get new table

I have a table with two columns A and B. I want to create a new table with two new columns added: X and Y.
The X column is to contain the values from the A column, but with the division performed. Values from the first row (from column A) divided by the values from the second row in column A and so for all subsequent rows, e.g. the third row divided by the fourth row etc.
The Y column is to contain the values from the B column, but with the division performed. Values from the first row (from column B) divided by the values from the second row in column B and so for all subsequent rows, e.g. the third row divided by the fourth row etc.
So far I used Excel for this. But now I need it in R if possible in the form of a function so that I can reuse this code easily. I haven't done this in R yet, so I am asking for help.
Example data:
structure(list(A = c(2L, 7L, 5L, 11L, 54L, 12L, 34L, 14L, 10L,
6L), B = c(3L, 5L, 1L, 21L, 67L, 32L, 19L, 24L, 44L, 37L)), class = "data.frame", row.names = c(NA,
-10L))
Sample results:
structure(list(A = c(2L, 7L, 5L, 11L, 54L, 12L, 34L, 14L, 10L,
6L), B = c(3L, 5L, 1L, 21L, 67L, 32L, 19L, 24L, 44L, 37L), X = c("",
"0,285714286", "", "0,454545455", "", "4,5", "", "2,428571429",
"", "1,666666667"), Y = c("", "0,6", "", "0,047619048", "", "2,09375",
"", "0,791666667", "", "1,189189189")), class = "data.frame", row.names = c(NA,
-10L))
You could use dplyr's across and lag (combined with modulo for picking every second row):
library(dplyr)
df |> mutate(across(c(A, B), ~ ifelse(row_number() %% 2 == 0, lag(.) / ., NA), .names = "new_{.col}"))
If you want a character vector change NA to "".
Output:
A B new_A new_B
1 2 3 NA NA
2 7 5 0.2857143 0.60000000
3 5 1 NA NA
4 11 21 0.4545455 0.04761905
5 54 67 NA NA
6 12 32 4.5000000 2.09375000
7 34 19 NA NA
8 14 24 2.4285714 0.79166667
9 10 44 NA NA
10 6 37 1.6666667 1.18918919
Function:
ab_fun <- function(data, vars) {
data |>
mutate(across(c(A, B), ~ ifelse(row_number() %% 2 == 0, lag(.) / ., NA), .names = "new_{.col}"))
}
ab_fun(df, c(A,B))
Updated with new data and correct code. + Function

how to get the differences across several column groups

I have a data like this
df<-structure(list(X1 = c(37L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, NA,
11L, 12L), X2 = c(40L, NA, 35L, 35L, 35L, 34L, NA, 28L, 28L,
NA, 25L, 24L), X3 = c(60L, 44L, 49L, 41L, NA, NA, NA, 25L, 26L,
NA, NA, 22L), T1 = c(19L, 55L, 47L, 46L, 36L, 42L, 25L, NA, 33L,
42L, 50L, 22L), T2 = c(75L, NA, 32L, 44L, 27L, 31L, 17L, NA,
18L, 45L, 10L, 11L), T3 = c(5L, 6L, 7L, 8L, 9L, 10L, 11L, NA,
46L, 36L, 42L, NA), P1 = c(2L, 2L, 3L, 4L, 2L, 6L, 7L, 8L, 9L,
NA, 1L, 12L), P2 = c(40L, 44L, 4L, 2L, 1L, 1L, NA, 1L, 1L, 1L,
5L, 55L), P3 = c(1L, 44L, 49L, 3L, NA, NA, NA, 25L, 26L, NA,
NA, 66L)), class = "data.frame", row.names = c(NA, -12L))
I have three groups and each group has 3 columns , they are called X, T and P.
I am trying to find out how many of rows in each group are overlapped with another group and how many rows in each group is different than another group. ( each row of each group must at lest have 2 values)
so I am looking for an output like this
X 10 rows overlapping with T and 2 different
T has 10 overlapping with X and 2 different
X has 10 overlapping with P and 1 different
T has 10 overlapping with P and 3 different
it means I have 10 rows of X1,X2 and X3 which have at least 2 values and they have values in the group T (T1,T2,T3). There is one row that is completely empty or has only 1 value but they have values in T group.
The same for other combination
This question is still sort of ambiguous and narrow, but here is the general idea for tidying your data to the point where you can easily summarize over different groups and/or rows:
library(tidyverse)
df %>%
as_tibble %>%
rowid_to_column %>%
gather(select=-rowid) %>%
separate(key, into=c('group', 'column'), sep=1) %>%
group_by(group)
Extending along the lines of John Colby's answer, you can summarize how many rows are populated with 2 or more non-NA values in each letter's columns:
library(tidyverse)
df_summarized <- df %>%
rowid_to_column() %>%
gather(colname, value, -rowid) %>%
separate(colname, into = c("letter", "number"), sep = 1) %>%
count(rowid, letter, wt = !is.na(value), name = "num_values") %>%
mutate(populated = num_values >= 2)
> df_summarized
# A tibble: 36 x 4
rowid letter num_values populated
<int> <chr> <int> <lgl>
1 1 P 3 TRUE
2 1 T 3 TRUE
3 1 X 3 TRUE
4 2 P 3 TRUE
5 2 T 2 TRUE
6 2 X 2 TRUE
7 3 P 3 TRUE
8 3 T 3 TRUE
9 3 X 3 TRUE
10 4 P 3 TRUE
# ... with 26 more rows
And then use that to compare between letters. For instance, here I see that 9 rows have the same populated / not-populated status among X and T columns. Three rows (7, 8, and 10) differ in their populated status between those two letters.
> df_summarized %>%
+ select(-num_values) %>%
+ spread(letter, populated)
# A tibble: 12 x 4
rowid P T X
<int> <lgl> <lgl> <lgl>
1 1 TRUE TRUE TRUE
2 2 TRUE TRUE TRUE
3 3 TRUE TRUE TRUE
4 4 TRUE TRUE TRUE
5 5 TRUE TRUE TRUE
6 6 TRUE TRUE TRUE
7 7 FALSE TRUE FALSE # T but no X
8 8 TRUE FALSE TRUE # X but no T
9 9 TRUE TRUE TRUE
10 10 FALSE TRUE FALSE # T but no X
11 11 TRUE TRUE TRUE
12 12 TRUE TRUE TRUE
We could query the data like this to get the overlaps and non-overlaps:
df_summarized %>%
select(-num_values) %>%
spread(letter, populated) %>%
summarize(PT = sum(P==T),
PT_non = sum(P!=T),
TX = sum(T==X),
TX_non = sum(T!=X),
XP = sum(X==P),
XP_non = sum(X!=P))
# A tibble: 1 x 6
PT PT_non TX TX_non XP XP_non
<int> <int> <int> <int> <int> <int>
1 9 3 9 3 12 0

Finding columns that contain values based on another column

I have the following data frame:
Step 1 2 3
1 5 10 6
2 5 11 5
3 5 13 9
4 5 15 10
5 13 18 10
6 15 20 10
7 17 23 10
8 19 25 10
9 21 27 13
10 23 30 7
I would like to retrieve the columns that satisfy one of the following conditions: if step 1 = step 4 or step 4 = step 8. In this case, column 1 and 3 should be retrieved. Column 1 because the value at Step 1 = value at step 4 (i.e., 5), and for column 3, the value at step 4 = value at step 8 (i.e., 10).
I don't know how to do that in R. Can someone help me please?
You can get the column indices by the following code:
df[1, -1] == df[4, -1] | df[4, -1] == df[8, -1]
# X1 X2 X3
# 1 TRUE FALSE TRUE
# data
df <- structure(list(Step = 1:10, X1 = c(5L, 5L, 5L, 5L, 13L, 15L,
17L, 19L, 21L, 23L), X2 = c(10L, 11L, 13L, 15L, 18L, 20L, 23L,
25L, 27L, 30L), X3 = c(6L, 5L, 9L, 10L, 10L, 10L, 10L, 10L, 13L,
7L)), class = "data.frame", row.names = c(NA, -10L))

Calculating top 4 of column 1 by column 2 - R

I'm new in R and to be honest don't know how to call what I'm looking for :)
I have data-set "ds" set with 2 columns:
D | res
==========
Ds 20
Dx 23
Dp 1
Ds 12
Ds 23
Ds 54
Dn 65
Ds 122
Dx 11
Dx 154
Dx 18
Do 4
Df 17
Dp 5
Dp 107
Dp 8
Df 3
Dp 33
Dd 223
Dc 7
Dv 22
Du 34
Dh 22
Ds 12
Dy 78
Dd 128
I need to calculate top 4 from column "D" by "Res" so desired result would look like :
D | Res
========
Dd 351
Dp 154
Ds 243
Dx 206
and by %age:
D | % Of Total
==========
Dd 29.10%
Dp 12.77%
Ds 20.15%
Dx 17.08%
Thanks
We can use aggregate() to obtain the sum of each type of "D", and we can introduce a new column to account for the edit of the OP and include also the percentage.
In order to display the result in the desired form, we can apply the order() function to rearrange the rows according to the value of Res. The function rev() in this case ensures that the highest value is put on top, and head() with the parameter 4 displays the first four rows.
summarized <- aggregate(Res ~. , df1, sum)
summarized$Perc <- with(summarized, paste0(round(Res/sum(Res)*100,2),"%"))
head(summarized[rev(order(summarized$Res)),],4)
D Res Perc
2 Dd 351 29.1%
8 Ds 243 20.15%
11 Dx 206 17.08%
7 Dp 154 12.77%
data
df1 <- structure(list(D = structure(c(8L, 11L, 7L, 8L, 8L, 8L, 5L,
8L, 11L, 11L, 11L, 6L, 3L, 7L, 7L, 7L, 3L, 7L, 2L, 1L, 10L, 9L,
4L, 8L, 12L, 2L), .Label = c("Dc", "Dd", "Df", "Dh", "Dn", "Do",
"Dp", "Ds", "Du", "Dv", "Dx", "Dy"), class = "factor"), Res = c(20L,
23L, 1L, 12L, 23L, 54L, 65L, 122L, 11L, 154L, 18L, 4L, 17L, 5L,
107L, 8L, 3L, 33L, 223L, 7L, 22L, 34L, 22L, 12L, 78L, 128L)),
.Names = c("D", "Res"), class = "data.frame", row.names = c(NA, -26L))
If you mean to sum Res per D and then select the top 4 sums (assuming you made mistakes calculating the sums for ds and dp) you could try:
library(dplyr)
df1 %>% mutate(per = Res/sum(Res)) %>% group_by(D) %>% summarise(Res = sum(Res), perc = sum(per)) %>% top_n(4, Res)
Source: local data frame [4 x 3]
D Res perc
(fctr) (int) (dbl)
1 Dd 351 0.2910448
2 Dp 154 0.1276949
3 Ds 243 0.2014925
4 Dx 206 0.1708126
Option using data.table
library(data.table)
out = setorder(setDT(data)[, .(tmp = sum(res)), by = D]
[, .(D, ptg = (tmp/sum(tmp))*100)], -ptg)[1:4,]
#> out
# D ptg
#1: Dd 29.10448
#2: Ds 20.14925
#3: Dx 17.08126
#4: Dp 12.76949

Resources