Start with the data:
> dput(Data1)
structure(list(X1 = structure(c(17L, 14L, 20L, 16L, 1L, 2L, 3L,
4L, 15L, 8L, 9L, 10L, 11L, 12L, 13L, 21L, 22L, 23L, 18L, 19L,
5L, 6L, 7L), .Label = c("Astra_1", "Astra_2", "Astra_3", "Astra_4",
"Audi_1", "Audi_2", "Audi_3", "BMW_1", "BMW_2", "BMW_3", "BMW_4",
"BMW_5", "Fiat_1", "Mazda_2", "Mercedes_1", "Nexia_1", "Porsche_1",
"Scania_1", "Scania_2", "Tico_1", "VW_1", "VW_2", "VW_3"), class = "factor"),
X2 = structure(c(2L, 3L, 10L, 7L, 8L, 12L, 9L, 14L, 11L,
4L, 5L, 6L, 15L, 13L, 4L, 5L, 9L, 14L, 11L, 1L, 3L, 10L,
16L), .Label = c("Astra_1", "Astra_3", "Astra_4", "Audi_1",
"Audi_2", "Audi_3", "BMW_1", "BMW_2", "Mazda_2", "Mercedes_1",
"Nexia_1", "Porsche_1", "Scania_2", "Tico_1", "VW_2", "VW_3"
), class = "factor"), AUC_1 = c(5860133.702, 1296009.939,
333123.4932, 250348.9407, 1376193.334, 4080502.863, 3777603.233,
3503973.487, 99101538.62, 231873.8462, 87258.75465, 147430.9913,
1028986.892, 1451482.832, 8136.72382, 25311.41683, 131352.7137,
565410.8186, 30196.23792, 70184.82268, 2526321.019, 381643.2138,
819687.9824), AUC_2 = c(4849720.322, 928980.4715, 320547.6185,
223287.2029, 1340641.323, 4720329.699, 4369150.434, 3371021.243,
108591253.3, 266489.7601, 85384.84604, 165726.7626, 1052130.559,
1470876.65, 9499.927679, 49309.74984, 138482.765, 444600.7911,
25132.73714, 55453.67019, 2038911.81, 422559.3293, 1445477.433
), ratio = c(1.20834467, 1.395088463, 1.03923247, 1.121196994,
1.02651866, 0.864452935, 0.864608186, 1.039439753, 0.91261069,
0.87010415, 1.021946618, 0.889602795, 0.978003046, 0.98681479,
0.856503765, 0.513314647, 0.948513078, 1.271726974, 1.201470327,
1.265647926, 1.2390536, 0.90317072, 0.567070757), Country = structure(c(1L,
1L, 2L, 3L, 5L, 1L, 5L, 1L, 4L, 7L, 4L, 7L, 7L, 7L, 6L, 6L,
6L, 6L, 8L, 8L, 6L, 6L, 7L), .Label = c("France", "Germany",
"Italy", "Norway", "Poland", "Spain", "Sweden", "Ukraine"
), class = "factor"), Comp = structure(c(3L, 5L, 16L, 9L,
8L, 9L, 12L, 14L, 4L, 15L, 11L, 14L, 16L, 17L, 10L, 10L,
12L, 13L, 1L, 2L, 5L, 6L, 7L), .Label = c("11,12", "12,13",
"12,13,14", "14,15", "14,15,16", "15,16,17", "16,17,18",
"2,3", "2,3,4", "3,4", "3,4,5", "4,5,6", "5,6", "5,6,7",
"5,6,7,8", "6,7,8", "7,8,9"), class = "factor")), .Names = c("X1",
"X2", "AUC_1", "AUC_2", "ratio", "Country", "Comp"), class = "data.frame", row.names = c(NA,
-23L))
The head of the data looks like this:
X1 X2 AUC_1 AUC_2 ratio Country Comp
1 Porsche_1 Astra_3 5860133.7 4849720.3 1.2083447 France 12,13,14
2 Mazda_2 Astra_4 1296009.9 928980.5 1.3950885 France 14,15,16
3 Tico_1 Mercedes_1 333123.5 320547.6 1.0392325 Germany 6,7,8
4 Nexia_1 BMW_1 250348.9 223287.2 1.1211970 Italy 2,3,4
5 Astra_1 BMW_2 1376193.3 1340641.3 1.0265187 Poland 2,3
6 Astra_2 Porsche_1 4080502.9 4720329.7 0.8644529 France 2,3,4
Now we are going to focus on the last two columns: Country and Comp. I would like to extract all rows that contain the same country and then compare their Comp values: if any of the numbers in column Comp is shared between rows, the strings from X1 and X2 of those rows should be stored together - possibly in separate vectors or in a matrix. It's possible that one row may belong to several "clusters"/"vectors".
Example of the desired output. This is just an example and the clustering is completely random. Any method for visualizing the output is acceptable.
Country 1 2 3 4 5 6
1 France Astra_3 Scania_2 Tico_1 NA NA NA
2 Poland Astra_4 Mazda_2 VW_3 Tico_2 NA NA
3 Sweden Mercedes_1 BMW_1 BMW_2 Audi_1 VW_3 NA
4 Norway BMW_1 Astra_1 Scania_2 Audi_3 NA NA
Assuming dat is your data.
library(data.table)
library(stringr)

# Work on `dat` by reference: make it a data.table, turn the car-name
# factors into character, and split Comp into a list column of its numbers.
setDT(dat)
dat[, c("X1", "X2", "Comp") := list(
  as.character(X1),
  as.character(X2),
  str_split(as.character(Comp), ",")
)]

# 1. Unnest so that every Comp number gets its own row.
# 2. Per country and Comp number, collect the distinct cars (X1 and X2).
# 3. Per country and car set, list the Comp numbers that produced it.
result <- dat[
  , lapply(.SD, unlist), by = 1:nrow(dat)
][
  , list(X = paste(sort(unique(c(X1, X2))), collapse = ",")),
  by = list(Country, Comp)
][
  , list(SharedComp = paste(Comp, collapse = ",")),
  by = list(Country, X)
]
head(result)
Country X SharedComp
1: France Astra_3,Porsche_1 12,13
2: France Astra_3,Astra_4,Mazda_2,Porsche_1 14
3: France Astra_4,Mazda_2 15,16
4: Germany Mercedes_1,Tico_1 6,7,8
5: Italy BMW_1,Nexia_1 2,3,4
6: Poland Astra_1,BMW_2 2,3
If you want output to look more like in your question, it's necessary to do some reshaping.
# Split each car set back into its members, unnest to one row per car
# (grouping by 1:nrow(result) yields a group column named `nrow`, which the
# later steps refer to), number the cars within each original row, and cast
# to wide format with one column per car position.
dcast(
  result[
    , list(Country, SharedComp, X = str_split(X, ","))
  ][
    , lapply(.SD, unlist), by = 1:nrow(result)
  ][
    , i := seq_len(.N), by = nrow
  ],
  formula = nrow + Country ~ i,
  value.var = "X"
)
nrow Country 1 2 3 4 5 6 7 8
1: 1 France Astra_3 Porsche_1 NA NA NA NA NA NA
2: 2 France Astra_3 Astra_4 Mazda_2 Porsche_1 NA NA NA NA
3: 3 France Astra_4 Mazda_2 NA NA NA NA NA NA
4: 4 Germany Mercedes_1 Tico_1 NA NA NA NA NA NA
5: 5 Italy BMW_1 Nexia_1 NA NA NA NA NA NA
6: 6 Poland Astra_1 BMW_2 NA NA NA NA NA NA
---
11: 11 Sweden Audi_1 Audi_3 BMW_1 BMW_3 NA NA NA NA
12: 12 Sweden Audi_1 Audi_3 BMW_1 BMW_3 BMW_4 VW_2 NA NA
13: 13 Sweden Audi_1 Audi_3 BMW_1 BMW_3 BMW_4 BMW_5 Scania_2 VW_2
---
25: 25 Spain Audi_2 Mercedes_1 NA NA NA NA NA NA
26: 26 Sweden Audi_3 VW_3 NA NA NA NA NA NA
nrow Country 1 2 3 4 5 6 7 8
I suppose what you want is: find all the rows with a given country, say Spain. Then within these rows, take all rows where a certain number occurs in the column Comp, e.g. 4. In these rows then extract the contents of columns X1 and X2 and put them together.
Maybe this code is what you want:
# Group cars by country and by each number that occurs in column "Comp".
#
# Reads the data frame `data` (columns "Country" as factor, "Comp", "X1",
# "X2") and fills `results`, a list with one entry per (country, number)
# pair that occurs in at least two rows. Each entry holds the country, the
# cars from X1 and X2 of the matching rows, and the shared number.
countries <- levels(data[, "Country"])
results <- list()
cn <- 1
# seq_along() keeps the loops from running when there is nothing to iterate
# over (1:length(x) would yield c(1, 0) for empty x).
for (i in seq_along(countries)) {
  # find all row numbers with that country:
  idx <- which(data[, "Country"] == countries[i])
  # split all the numbers in the column "Comp" by "," -- once per country,
  # since the split does not depend on the number being looked up:
  CompList <- strsplit(as.character(data[idx, "Comp"]), ",")
  # get all numbers which occur for that country (missing values are not
  # treated as a shared number):
  numbers <- unique(as.numeric(unlist(CompList)))
  numbers <- numbers[!is.na(numbers)]
  for (j in seq_along(numbers)) {
    # get all the row numbers for that country where numbers[j] is contained
    # in the column "Comp"; %in% never returns NA, so rows with a missing
    # Comp entry cannot introduce NA row indices:
    hit <- vapply(
      CompList,
      function(x) as.character(numbers[j]) %in% x,
      logical(1)
    )
    rows <- idx[hit]
    # assuming you want a number in the column "Comp" to occur at least in
    # two rows:
    if (length(rows) > 1) {
      results[[cn]] <- list(
        "Country" = countries[i],
        "Cars" = as.vector(as.matrix(data[rows, c("X1", "X2")])),
        "ValueOfComp" = numbers[j]
      )
      cn <- cn + 1
    }
  }
}
This gives you something like this:
> results
[[1]]
[[1]]$Country
[1] "France"
[[1]]$Cars
[1] "Porsche_1" "Mazda_2" "Astra_3" "Astra_4"
[[1]]$ValueOfComp
[1] 14
[[2]]
[[2]]$Country
[1] "Spain"
[[2]]$Cars
[1] "Fiat_1" "VW_1" "Audi_1" "Audi_2"
[[2]]$ValueOfComp
[1] 3
Related
I have a list of nested data frames and I want to extract the observations of the earliest year. My problem is that the first year changes between the data frames: it is either 1992 or 2005.
I want to create a list to store them. I tried with which, but since the same year occurs several times, the observations are repeated, and I want them kept apart.
# Asker's attempt: which() returns the positions where element `i` of `df`
# equals 1992 or 2005, not the matching observations themselves.
new_df<- which(df[[i]]==1992 | df[[i]]==2005)
I've tried with ifelse(), but I have to run an lm operation afterwards, and it doesn't work. And I can't take only the first rows, because the years are repeated.
my code looks like this:
# Question's example data: a list of four data frames whose first column
# holds the years (1992:2015 or 2005:2015) and whose second column holds a
# random permutation of the row numbers.
# NOTE(review): `<-` inside the list()/data.frame() calls assigns variables
# as a side effect instead of naming the list elements and columns.
df<- list(a<-data.frame(a_1<-(1992:2015),
a_2<-sample(1:24)),
b<-data.frame(b_1<-(1992:2015),
b_2<-sample(1:24)),
c<-data.frame(c_1<-(2005:2015),
c_2<-sample(1:11)),
d<-data.frame(d_1<-(2005:2015),
d_2<-sample(1:11)))
You can define a function to get the data on one data.frame and loop on the list to extract values.
Below I use map from the purrr package but you can also use lapply and for loops
Please do not use <- when assigning values inside a function call (here data.frame()), because it will mess up the column names. = is what function calls use to name their arguments, and it is fine to use it there. You can read this ;)
# Example data: a named list of four data frames. The first column of each
# holds the years, the second a random permutation of the row numbers.
# `=` names the list elements and columns; using `<-` here would instead
# create stray variables a, b, c, d and leave the list unnamed.
df <- list(
  a = data.frame(a_1 = 1992:2015, a_2 = sample(1:24)),
  b = data.frame(b_1 = 1992:2015, b_2 = sample(1:24)),
  c = data.frame(c_1 = 2005:2015, c_2 = sample(1:11)),
  d = data.frame(d_1 = 2005:2015, d_2 = sample(1:11))
)
#' Extract the values observed in the earliest year
#'
#' @param df A data frame whose first column holds the years and whose
#'   second column holds the values.
#' @return The second-column values of every row whose year equals the
#'   minimum year, each named with that year. A zero-length vector is
#'   returned when `df` has no non-missing year.
extract_miny <- function(df) {
  years <- df[[1]]
  values <- df[[2]]
  # Nothing to select from: avoid min() on an empty set.
  if (all(is.na(years))) {
    return(values[0])
  }
  miny <- min(years, na.rm = TRUE)
  # which() drops NA comparisons so missing years never select a row.
  res <- values[which(years == miny)]
  # One name per element; a single name would leave the rest NA when the
  # earliest year is repeated.
  names(res) <- rep(miny, length(res))
  res
}
# map() comes from purrr, so attach the package before calling it.
library(purrr)
# Apply extract_miny() to every data frame of the list; returns a list.
map(df, extract_miny)
If the data is sorted as the example, you can slice() the first row for the information. Notice the use of = rather than <- in creating a nested dataframe.
library(tidyverse)

# Named list of example data frames; the names are reused as labels below.
df <- list(
  a = data.frame(a_1 = 1992:2015, a_2 = sample(1:24)),
  b = data.frame(b_1 = 1992:2015, b_2 = sample(1:24)),
  c = data.frame(c_1 = 2005:2015, c_2 = sample(1:11)),
  d = data.frame(d_1 = 2005:2015, d_2 = sample(1:11))
)

# For each data frame take its first row, give the two columns common
# names, tag the row with the list name, and row-bind the results.
df %>%
  imap_dfr(function(tbl, tbl_name) {
    first_row <- slice(tbl, 1)
    first_row <- set_names(first_row, c("year", "value"))
    first_row <- mutate(first_row, dataframe = tbl_name)
    as_tibble(first_row)
  })
# A tibble: 4 x 3
year value dataframe
<int> <int> <chr>
1 1992 19 a
2 1992 2 b
3 2005 1 c
4 2005 5 d
You may subset with an anonymous function.
# Keep, for every data frame, the rows whose first column equals its
# minimum, rename the columns, and stack the pieces.
do.call(
  rbind,
  lapply(df, function(x) {
    is_first_year <- x[[1]] == min(x[[1]])
    setNames(x[is_first_year, ], c("year", "value"))
  })
)
# year value
# 1 1992 6
# 2 1992 9
# 3 2005 11
# 4 2005 11
Or, maybe better, by creating a variable that records which sample each value stems from.
# Add a `sample` column holding one letter per data frame, then keep the
# rows at the minimum of the first column, rename, and stack the pieces.
do.call(
  rbind,
  lapply(
    Map(`[<-`, df, "sample", value = letters[seq_along(df)]),
    function(x) {
      setNames(x[x[[1]] == min(x[[1]]), ], c("year", "value", "sample"))
    }
  )
)
# year value sample
# 1 1992 6 a
# 2 1992 9 b
# 3 2005 11 c
# 4 2005 11 d
Data:
df <- list(structure(list(a_1.....1992.2015. = 1992:2015, a_2....sample.1.24. = c(6L,
18L, 23L, 5L, 7L, 14L, 4L, 10L, 19L, 17L, 15L, 1L, 11L, 22L,
13L, 8L, 20L, 16L, 2L, 3L, 24L, 21L, 9L, 12L)), class = "data.frame", row.names = c(NA,
-24L)), structure(list(b_1.....1992.2015. = 1992:2015, b_2....sample.1.24. = c(9L,
24L, 18L, 8L, 16L, 11L, 13L, 23L, 15L, 20L, 19L, 21L, 12L, 22L,
7L, 3L, 6L, 17L, 2L, 5L, 4L, 10L, 1L, 14L)), class = "data.frame", row.names = c(NA,
-24L)), structure(list(c_1.....2005.2015. = 2005:2015, c_2....sample.1.11. = c(11L,
2L, 5L, 10L, 9L, 6L, 1L, 7L, 3L, 8L, 4L)), class = "data.frame", row.names = c(NA,
-11L)), structure(list(d_1.....2005.2015. = 2005:2015, d_2....sample.1.11. = c(11L,
2L, 5L, 1L, 6L, 9L, 3L, 7L, 10L, 4L, 8L)), class = "data.frame", row.names = c(NA,
-11L)))
I have a table with two columns A and B. I want to create a new table with two new columns added: X and Y. Both new columns are to contain data from column A, but only every second row of it: column X takes every second value starting from the first value in column A, and column Y takes every second value starting from the second value in column A.
So far, I have been doing this in Excel. But now I need it in R, ideally as a function, so that I can easily reuse the code. I haven't done this in R yet, so I am asking for help.
Example data:
structure(list(A = c(2L, 7L, 5L, 11L, 54L, 12L, 34L, 14L, 10L,
6L), B = c(3L, 5L, 1L, 21L, 67L, 32L, 19L, 24L, 44L, 37L)), class = "data.frame", row.names = c(NA,
-10L))
Sample result:
structure(list(A = c(2L, 7L, 5L, 11L, 54L, 12L, 34L, 14L, 10L,
6L), B = c(3L, 5L, 1L, 21L, 67L, 32L, 19L, 24L, 44L, 37L), X = c(2L,
NA, 5L, NA, 54L, NA, 34L, NA, 10L, NA), Y = c(NA, 7L, NA, 11L,
NA, 12L, NA, 14L, NA, 6L)), class = "data.frame", row.names = c(NA,
-10L))
It is not a super elegant solution, but it works:
# Example data from the question.
exampleDF <- data.frame(
  A = c(2L, 7L, 5L, 11L, 54L, 12L, 34L, 14L, 10L, 6L),
  B = c(3L, 5L, 1L, 21L, 67L, 32L, 19L, 24L, 44L, 37L)
)

# Positions of the odd rows (1, 3, 5, ...).
index <- seq(1, nrow(exampleDF), by = 2)

# X keeps A on the odd rows only; Y keeps A on the remaining (even) rows.
exampleDF$X <- replace(rep(NA, nrow(exampleDF)), index, exampleDF$A[index])
exampleDF$Y <- replace(exampleDF$A, index, NA)
You could also make use of the row numbers and the modulo operator:
A simple ifelse way:
library(dplyr)

# Odd-numbered rows feed X, even-numbered rows feed Y; all other cells
# become NA.
mutate(
  df,
  X = ifelse(row_number() %% 2 != 0, A, NA),
  Y = ifelse(row_number() %% 2 == 0, A, NA)
)
Or using pivoting:
library(dplyr)
library(tidyr)

# Label odd rows "X" and even rows "Y", copy A into `value`, and let
# pivot_wider() spread `value` into one column per label (it picks up the
# `name` and `value` columns through its default arguments).
pivot_wider(
  mutate(
    df,
    name = ifelse(row_number() %% 2 != 0, "X", "Y"),
    value = A
  )
)
A function using the first approach could look like:
See comment
#' Split one column into two alternating-row columns
#'
#' Adds two columns to `data`: the first keeps the values of `A` on the
#' odd-numbered rows, the second keeps them on the even-numbered rows;
#' every other cell is `NA`.
#'
#' @param data A data frame.
#' @param A Unquoted column to take the values from.
#' @param X Unquoted name of the first new column (odd rows).
#' @param Y Unquoted name of the second new column (even rows).
#' @return `data` with the two new columns added.
xy_fun <- function(data, A = A, X = X, Y = Y) {
  mutate(
    data,
    {{ X }} := ifelse(row_number() %% 2 != 0, {{ A }}, NA),
    {{ Y }} := ifelse(row_number() %% 2 == 0, {{ A }}, NA)
  )
}

xy_fun(
  df, # Your data
  A,  # The col to take values from
  X,  # The column name of the first new column
  Y   # The column name of the second new column
)
Output:
A B X Y
1 2 3 2 NA
2 7 5 NA 7
3 5 1 5 NA
4 11 21 NA 11
5 54 67 54 NA
6 12 32 NA 12
7 34 19 34 NA
8 14 24 NA 14
9 10 44 10 NA
10 6 37 NA 6
Data stored as df:
df <- structure(list(A = c(2L, 7L, 5L, 11L, 54L, 12L, 34L, 14L, 10L, 6L),
B = c(3L, 5L, 1L, 21L, 67L, 32L, 19L, 24L, 44L, 37L)
),
class = "data.frame",
row.names = c(NA, -10L)
)
I like @harre's approach.
Here is another approach with base R:
use R's recycling ability (a shorter vector is recycled to the length of a longer one):
# Start both new columns as copies of A (the question asks for X and Y to
# both come from A), then blank alternating rows. The length-2 logical
# index is recycled along the column, so c(FALSE, TRUE) selects the even
# rows and c(TRUE, FALSE) the odd rows.
df$X <- df$A
df$Y <- df$A
df$X[c(FALSE, TRUE)] <- NA
df$Y[c(TRUE, FALSE)] <- NA
df
#     A  B  X  Y
# 1   2  3  2 NA
# 2   7  5 NA  7
# 3   5  1  5 NA
# 4  11 21 NA 11
# 5  54 67 54 NA
# 6  12 32 NA 12
# 7  34 19 34 NA
# 8  14 24 NA 14
# 9  10 44 10 NA
# 10  6 37 NA  6
The following code will return the average conditioned that the months are greater than 6.
# Mean of the delta1 values greater than 6, ignoring NA. TRUE is spelled
# out because T is an ordinary variable that can be reassigned.
mean(df[df$delta1 > 6, "delta1"], na.rm = TRUE)
Now, how do I apply this to every column in the dataframe?
df:
delta1 delta2 delta3
NA 2 3
4 NA 6
7 8 NA
10 NA 12
NA 14 15
16 NA 18
19 20 NA
The apply-family of functions is useful here:
# Per column: mean of the values greater than 6, ignoring NA. vapply()
# guarantees a numeric vector with one entry per column, whatever the
# input looks like.
vapply(df, function(x) mean(x[x > 6], na.rm = TRUE), numeric(1))
We can set the values in the dataframe that are less than or equal to 6 to NA and compute the mean using colMeans, ignoring the NA values.
# Blank out every value that is not above 6 (this overwrites `df`), then
# average column-wise while skipping the NA cells.
df <- replace(df, df <= 6, NA)
colMeans(df, na.rm = TRUE)
#delta1 delta2 delta3
#    13     14     15
data
df <- structure(list(delta1 = c(NA, 4L, 7L, 10L, NA, 16L, 19L), delta2 = c(2L,
NA, 8L, NA, 14L, NA, 20L), delta3 = c(3L, 6L, NA, 12L, 15L, 18L,
NA)), class = "data.frame", row.names = c(NA, -7L))
I am trying to add to each row of my data the number that belongs to its data label.
My data looks like this:
df <- structure(list(data = structure(c(1L, 1L, 1L, 1L, 1L, 3L, 3L,
4L, 4L, 5L, 5L, 6L, 5L, 7L, 7L, 8L, 8L, 2L, 2L, 2L), .Label = c("data1",
"data10", "data2", "data3", "data4", "data5", "data6", "data7"
), class = "factor"), values = structure(c(3L, 8L, 18L, 1L, 15L,
17L, 19L, 7L, 2L, 2L, 11L, 10L, 6L, 4L, 9L, 12L, 14L, 5L, 13L,
16L), .Label = c("112864.443", "11319531", "12874.443", "142983324",
"1612410048", "16349475.63", "184901841", "2223793.8", "30553282.01",
"312004.547", "3135868.44", "317403612.9", "3686081.063", "43701608",
"623793.8", "64959501.42", "67666215", "767666215", "775987137.8"
), class = "factor")), .Names = c("data", "values"), class = "data.frame", row.names = c(NA,
-20L))
I want the number that follows "data" in my first column as a value of its own. Since the labels are not consecutive, I don't know how to add these numbers as a separate column. The desired output should look like this:
data values
data1 12874.443 1
data1 2223793.8 1
data1 767666215 1
data1 112864.443 1
data1 623793.8 1
data2 67666215 2
data2 775987137.8 2
data3 184901841 3
data3 11319531 3
data4 11319531 4
data4 3135868.44 4
data5 312004.547 5
data4 16349475.63 4
data6 142983324 6
data6 30553282.01 6
data7 317403612.9 7
data7 43701608 7
data10 1612410048 10
data10 3686081.063 10
data10 64959501.42 10
One way is to use gsub to strip everything except the digits and add the result as another column:
# Remove every non-digit character from the label and store the remaining
# number. as.numeric() makes the new column numeric rather than character.
df$label <- as.numeric(gsub("[^[:digit:]]", "", df$data))
Another way is to use str_extract, thanks to the question "R: split character data into numbers and letters":
library(stringr)

# Take the first run of digits from each label and store it as a number.
df$label <- df$data |>
  str_extract("[0-9]+") |>
  as.numeric()
> df
# data values label
# 1 data1 12874.443 1
# 2 data1 2223793.8 1
# 3 data1 767666215 1
# 4 data1 112864.443 1
# 5 data1 623793.8 1
# 6 data2 67666215 2
# 7 data2 775987137.8 2
# 8 data3 184901841 3
# 9 data3 11319531 3
# 10 data4 11319531 4
# 11 data4 3135868.44 4
# 12 data5 312004.547 5
# 13 data4 16349475.63 4
# 14 data6 142983324 6
# 15 data6 30553282.01 6
# 16 data7 317403612.9 7
# 17 data7 43701608 7
# 18 data10 1612410048 10
# 19 data10 3686081.063 10
# 20 data10 64959501.42 10
This question already has answers here:
How to reshape data from long to wide format
(14 answers)
How to sum a variable by group
(18 answers)
Aggregate / summarize multiple variables per group (e.g. sum, mean)
(10 answers)
Closed 5 years ago.
I have a dataset like the one below:
test <- structure(list(SR = c(1L, 1L, 15L, 20L, 20L, 96L, 110L, 110L,
121L, 121L, 130L, 130L, 143L, 143L), Area = structure(c(3L, 3L,
1L, 1L, 1L, 1L, 2L, 2L, 1L, 1L, 4L, 4L, 2L, 2L), .Label = c("FH",
"MO", "TSC", "WMB"), class = "factor"), Period = structure(c(1L,
2L, 1L, 1L, 2L, 1L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L), .Label = c("First",
"Second"), class = "factor"), count = c(4L, 6L, 3L, 6L, 6L, 3L,
6L, 6L, 6L, 6L, 6L, 6L, 5L, 6L), countTotal = c(10L, 10L, 3L,
12L, 12L, 3L, 12L, 12L, 12L, 12L, 12L, 12L, 11L, 11L), SumTotal = c(1520,
5769.02, 29346.78, 13316.89, 11932.68, 10173.05, 13243.5, 17131.94,
111189.07, 84123.52, 79463.1, 120010.57, 7035.88, 11520)), .Names = c("SR",
"Area", "Period", "count", "countTotal", "SumTotal"), class = "data.frame", row.names = c(NA,
-14L))
SR Area Period count countTotal SumTotal
1 TSC First 4 10 1520.00
1 TSC Second 6 10 5769.02
15 FH First 3 3 29346.78
20 FH First 6 12 13316.89
20 FH Second 6 12 11932.68
96 FH First 3 3 10173.05
110 MO First 6 12 13243.50
110 MO Second 6 12 17131.94
121 FH First 6 12 111189.07
121 FH Second 6 12 84123.52
130 WMB First 6 12 79463.10
130 WMB Second 6 12 120010.57
143 MO First 5 11 7035.88
143 MO Second 6 11 11520.00
I want to convert some of the rows to columns to make the dataset look like this:
SR Area countTotal First.Count Second.Count First.SumTotal Second.SumTotal
1 TSC 10 4 6 1520.00 5769.02
15 FH 3 3 NA 29346.78 NA
20 FH 12 6 6 13316.89 11932.68
96 FH 3 3 NA 10173.05 NA
110 MO 12 6 6 13243.50 17131.94
121 FH 12 6 6 111189.07 84123.52
130 WMB 12 6 6 79463.10 120010.57
143 MO 11 5 6 7035.88 11520.00
I was trying to use spread from tidyr with this code
test %>% spread(Period, SumTotal) but I still get two lines for each SR and Area.
Can someone help?
You need to first gather by the columns you want to spread, and combine the Period column with the variable column, then spread the resulting variable column:
# Reshape `test` so that each Period gets its own set of value columns:
#   1. gather() stacks the columns count through SumTotal into
#      variable/value pairs;
#   2. unite() prefixes each variable name with its Period, separated by
#      "." (e.g. "First.count");
#   3. spread() turns the combined names back into columns.
# NOTE(review): countTotal lies inside count:SumTotal, so it is spread per
# Period as well rather than kept as an identifying column. gather() and
# spread() are superseded by pivot_longer()/pivot_wider() in current tidyr.
library(dplyr)
library(tidyr)
test %>%
gather(variable, value, count:SumTotal) %>%
unite("variable", Period, variable, sep = ".") %>%
spread(variable, value)
Result:
SR Area First.count First.countTotal First.SumTotal Second.count Second.countTotal
1 1 TSC 4 10 1520.00 6 10
2 15 FH 3 3 29346.78 NA NA
3 20 FH 6 12 13316.89 6 12
4 96 FH 3 3 10173.05 NA NA
5 110 MO 6 12 13243.50 6 12
6 121 FH 6 12 111189.07 6 12
7 130 WMB 6 12 79463.10 6 12
8 143 MO 5 11 7035.88 6 11
Second.SumTotal
1 5769.02
2 NA
3 11932.68
4 NA
5 17131.94
6 84123.52
7 120010.57
8 11520.00