Related
Suppose I have a dataframe:
dw <- read.table(header=T, text='
ID q1 q2 q3 q4 q5 ...q10
A 10 6 50 10 bA
B 12 5 70 11 bB
C 20 7 20 8 bC
D 22 8 22 9 bD
')
I would like to move every 2 columns after 'ID' to new rows so it looks like:
ID q1 q2
A 10 6
B 12 5
C 20 7
D 22 8
A 50 10
B 70 11
C 20 8
D 22 9
....
pivot_longer seems to move every single column instead of multiple columns?
It seems that you are not concerned with the column names (other than ID), and that they are all the same class. For this, we can "pivot" manually, without the safeguards or power of pivot_longer perhaps, but without the requirements as well.
The first step is to make sure that class won't be an issue; because you have some strings in there, we need to convert all to character:
dw[-1] <- lapply(dw[-1], as.character)
After that, we can manually extract every two (non-ID) columns and combine with ID:
# Column positions of everything except ID (column 1).
cols <- seq_along(dw)[-1]
# cols %/% 2 gives 1, 1, 2, 2, 3, 3, ... so split() groups the positions in
# consecutive pairs; each pair is extracted alongside ID and given common names.
list_of_frames <- lapply(
  split(cols, cols %/% 2),
  function(pair_idx) {
    chunk <- dw[, c(1, pair_idx)]
    setNames(chunk, c("ID", "q1", "q2"))
  }
)
list_of_frames
# $`1`
# ID q1 q2
# 1 A 10 6
# 2 B 12 5
# 3 C 20 7
# 4 D 22 8
# $`2`
# ID q1 q2
# 1 A 50 10
# 2 B 70 11
# 3 C 20 8
# 4 D 22 9
# $`3`
# ID q1 q2
# 1 A bA zA
# 2 B bB zB
# 3 C bC zC
# 4 D bD zD
This can be easily combined with several methods, choose one of:
data.table::rbindlist(list_of_frames)
dplyr::bind_rows(list_of_frames)
do.call(rbind, list_of_frames)
# ID q1 q2
# 1 A 10 6
# 2 B 12 5
# 3 C 20 7
# 4 D 22 8
# 5 A 50 10
# 6 B 70 11
# 7 C 20 8
# 8 D 22 9
# 9 A bA zA
# 10 B bB zB
# 11 C bC zC
# 12 D bD zD
Data
dw <- structure(list(ID = c("A", "B", "C", "D"), q1 = c("10", "12", "20", "22"), q2 = c("6", "5", "7", "8"), q3 = c("50", "70", "20", "22"), q4 = c("10", "11", "8", "9"), q5 = c("bA", "bB", "bC", "bD"), q6 = c("zA", "zB", "zC", "zD")), row.names = c(NA, -4L), class = "data.frame")
Another option:
data.frame(ID = dw$ID,
q1 = unlist(dw[,seq(2, ncol(dw), 2)], use.names = FALSE),
q2 = unlist(dw[,seq(3, ncol(dw), 2)], use.names = FALSE))
With data:
dw <- structure(list(ID = c("A", "B", "C", "D"),
q1 = c(10L, 12L, 20L, 22L),
q2 = c(6L, 5L, 7L, 8L),
q3 = c(50L, 70L, 20L, 22L),
q4 = c(10L, 11L, 8L, 9L),
q5 = c("bA", "bB", "bC", "bD"),
q6 = c("cc", "dd", "ee", "ff"))
, class = "data.frame", row.names = c(NA, -4L))
data.frame(ID = dw$ID,
q1 = unlist(dw[,seq(2, ncol(dw), 2)], use.names = FALSE),
q2 = unlist(dw[,seq(3, ncol(dw), 2)], use.names = FALSE))
#> ID q1 q2
#> 1 A 10 6
#> 2 B 12 5
#> 3 C 20 7
#> 4 D 22 8
#> 5 A 50 10
#> 6 B 70 11
#> 7 C 20 8
#> 8 D 22 9
#> 9 A bA cc
#> 10 B bB dd
#> 11 C bC ee
#> 12 D bD ff
Or more generally:
n <- 3L # operate on every 3 columns
data.frame(
setNames(
c(
list(dw[,1]),
lapply(
2:(n + 1L),
function(i) unlist(dw[,seq(i, ncol(dw), n)], TRUE, FALSE)
)
),
names(dw)[1:(n + 1L)]
)
)
#> ID q1 q2 q3
#> 1 A 10 6 50
#> 2 B 12 5 70
#> 3 C 20 7 20
#> 4 D 22 8 22
#> 5 A 10 bA cc
#> 6 B 11 bB dd
#> 7 C 8 bC ee
#> 8 D 9 bD ff
The melt(...) method for data.table allows for melting groups of columns. Using dw from @r2evans's answer:
library(data.table)
setDT(dw)
result <- melt(dw, measure.vars = list(seq(2, ncol(dw), 2), seq(3, ncol(dw), 2)))
result[, variable:=NULL]
result
## ID value1 value2
## 1: A 10 6
## 2: B 12 5
## 3: C 20 7
## 4: D 22 8
## 5: A 50 10
## 6: B 70 11
## 7: C 20 8
## 8: D 22 9
## 9: A bA zA
## 10: B bB zB
## 11: C bC zC
## 12: D bD zD
melt(...) introduces a column variable which keeps track of the location of the original columns in the wide dataset. You don't seem to care about that so it's removed. If there are indeed different classes (integer, character) melt(...) will take care of that with a warning.
I am trying to divide each cell in a data frame by the sum of the column. For example, I have a data frame df:
Company_Name Company_Location Area_code Fund_1 Fund_2
A SGD 15 10 NA
B LDN 85 NA 4
C NY 54 3 NA
D SGD 15 NA 6
E LDN 85 4 5
dat <- structure(list(Company_Name = c("A", "B", "C", "D", "E"), Company_Location = c("SGD",
"LDN", "NY", "SGD", "LDN"), Area_code = c(15L, 85L, 54L, 15L,
85L), Fund_1 = c(10L, NA, 3L, NA, 4L), Fund_2 = c(NA, 4L, NA,
6L, 5L)), row.names = c(NA, -5L), class = c("tbl_df", "tbl",
"data.frame"))
I would like to create a new data frame that takes each cell in and divides by the sum of the column, like so:
Company_Name Company_Location Area_code Fund_1 Fund_2
A SGD 15 0.588 NA
B LDN 85 NA 0.267
C NY 54 0.176 NA
D SGD 15 NA 0.400
E LDN 85 0.235 0.333
I tried the following without success.
DF <- apply(DF[,4:5],2,function(x){x/sum(x)})
Your problem is that sum returns NA whenever the column contains NAs; you need na.rm=TRUE.
apply(DF[, 4:5], 2, function(x) {x/sum(x, na.rm=TRUE)})
However, since R is vectorized, you may use colSums.
# Divide every cell of the two Fund columns by that column's total
# (NAs are ignored when computing the totals).
# Note: a plain `DF[4:5] / colSums(DF[4:5], na.rm = TRUE)` is wrong here: the
# vector of totals is recycled down the rows (column-major), so cells end up
# divided by the other column's total. sweep() with MARGIN = 2 pairs each
# column with its own total.
DF[4:5] <- sweep(DF[4:5], 2, colSums(DF[4:5], na.rm = TRUE), "/")
DF
# Company_Name Company_Location Area_code Fund_1 Fund_2
# 1 A SGD 15 0.5882353 NA
# 2 B LDN 85 NA 0.2666667
# 3 C NY 54 0.1764706 NA
# 4 D SGD 15 NA 0.4000000
# 5 E LDN 85 0.2352941 0.3333333
Here's a tidyverse solution:
mutate(dat, across(c("Fund_1", "Fund_2"), ~. / sum(., na.rm = TRUE)))
I have a large data set:
> ncol(d) [1] 1680 nrow(d) [1] 12
that it looks like this:
a b c e f g
3 2 5 1 3 6
a b c d e g
1 7 8 4 5 8
a c d e f h #in this row b does not exist
5 10 4 7 5 10
And I need that it looks like this:
a b c d e f g h
3 2 5 0 1 3 6 0
1 7 8 4 5 0 8 0
5 0 10 4 7 5 0 10 #and all the other columns ...
Since my data is really long and I have many corrections like this one to do over all the data set, it is hard to do it by hand. I would like to know if there is any way to do this using some sort of automatic way, like a logic function or a loop.
Any idea is welcome
Regards
Here's a possible approach using data.table:
library(data.table)
melt(
setDT(
setnames(
data.table::transpose(df1),
paste(rep(1:(nrow(df1)/2), each = 2), c("name", "value"), sep = "_"))),
measure = patterns("name", "value"))[
, dcast(.SD, variable ~ value1, value.var = "value2", fill = 0)]
# variable a b c d e f g h
# 1: 1 3 2 5 0 1 3 6 0
# 2: 2 1 7 8 4 5 0 8 0
# 3: 3 5 0 10 4 7 5 0 10
We could get the alternate rows with recycling logical vector, construct a data.frame and pivot it to wide format with pivot_wider
library(dplyr)
library(tidyr)
library(data.table)
sub1 <- df1[c(TRUE, FALSE),]
sub2 <- df1[c(FALSE, TRUE),]
tibble(ind = c(row(sub1)), col1 = factor(unlist(sub1), levels = letters[1:8]),
col2 = as.integer(unlist(sub2))) %>%
pivot_wider(names_from = col1, values_from = col2,
values_fill = list(col2 = 0)) %>%
select(-ind)
#A tibble: 3 x 8
# a b c d e f g h
# <int> <int> <int> <int> <int> <int> <int> <int>
#1 3 2 5 0 1 3 6 0
#2 1 7 8 4 5 0 8 0
#3 5 0 10 4 7 5 0 10
Or using base R with reshape
out <- reshape(
data.frame(ind = c(row(sub1)),
col1 = factor(unlist(sub1), levels = letters[1:8]),
col2 = as.integer(unlist(sub2))),
idvar = 'ind', direction = 'wide', timevar = 'col1')[-1]
names(out) <- sub("col2\\.", "", names(out))
out[is.na(out)] <- 0
row.names(out) <- NULL
out
# a b c d e f g h
#1 3 2 5 0 1 3 6 0
#2 1 7 8 4 5 0 8 0
#3 5 0 10 4 7 5 0 10
data
df1 <- structure(list(v1 = c("a", "3", "a", "1", "a", "5"), v2 = c("b",
"2", "b", "7", "c", "10"), v3 = c("c", "5", "c", "8", "d", "4"
), v4 = c("e", "1", "d", "4", "e", "7"), v5 = c("f", "3", "e",
"5", "f", "5"), v6 = c("g", "6", "g", "8", "h", "10")), class = "data.frame",
row.names = c(NA,
-6L))
I have a data frame that is sorted based on one column(numeric column) to assign the rank. if this column value is zero then arrange the data frame based on another character column for those rows which have zero as a value in a numeric column.
But to give rank I have to consider var2 that is the reason I sorted based on var2, if there is any identical values in var2 for those rows I have to consider var3 to give rank. please see the data frame 2 and 3 rows, var2 values are identical in that case i have to consider var3 to give rank. In case var2 is zero i have to sort the var1 column(character column) in alphabetical order and give rank. if var2 is NA no rank. please refer the data frame given below.
Below, the data frame is sorted based on var2 column descending order, but var2 contains zero also if var2 is zero I have to sort the data frame based on var1 for the rows which are having zero in var2. I need sort by var1 for those rows which are having var2 as zero and followed by NA in alphabetical order of var1.
example:
# var1 var2 var3 rank
# 1 c 556 45 1
# 2 a 345 35 3
# 3 f 345 64 2
# 4 b 134 87 4
# 5 z 0 34 5
# 6 d 0 32 6
# 7 c 0 12 7
# 8 a 0 23 8
# 9 e NA
# 10 b NA
below is my code
df <- data.frame(var1=c("c","a","f","b","z","d", "c","a", "e", "b", "ad", "gf", "kg", "ts", "mp"), var2=c(134, NA,345, 200, 556,NA, 345, 200, 150, 0, 25,10,0,150,0), var3=c(65,'',45,34,68,'',73,12,35,23,34,56,56,78,123))
# To break the tie between var3 and var2
orderdf <- df[order(df$var2, df$var1, decreasing = TRUE), ]
#assigning rank
rankdf <- orderdf %>% mutate(rank = ifelse(is.na(var2),'', seq(1:nrow(orderdf))))
expected output is sort the var1 in alphabetical order if var2 value is zero(for those rows with var2 value is zero)
expected output:
# var1 var2 var3 rank
# 1 c 556 45 1
# 2 a 345 35 3
# 3 f 345 64 2
# 4 b 134 87 4
# 5 a 0 34 5
# 6 c 0 32 6
# 7 d 0 12 7
# 8 z 0 23 8
# 9 b NA
# 10 e NA
With dplyr you can use
df %>%
arrange(desc(var2), var1)
and afterwards you create the column rank
EDIT
The following code is a bit cumbersome but it gets the job done. Basically it orders the rows in which var2 is equal or different from zero separately, then combines the two ordered dataframes together and finally creates the rank column.
Data
df <- data.frame(
var1 = c("c","a","f","b","z","d", "c","a", "e", "z", "ad", "gf", "kg", "ts", "mp"),
var2 = c(134, NA,345, 200, 556,NA, 345, 200, 150, 0, 25,10,0,150,0),
var3 = as.numeric(c(65,'',45,34,68,'',73,12,35,23,34,56,56,78,123))
)
df
# var1 var2 var3
# 1 c 134 65
# 2 a NA NA
# 3 f 345 45
# 4 b 200 34
# 5 z 556 68
# 6 d NA NA
# 7 c 345 73
# 8 a 200 12
# 9 e 150 35
# 10 z 0 23
# 11 ad 25 34
# 12 gf 10 56
# 13 kg 0 56
# 14 ts 150 78
# 15 mp 0 123
Code
df %>%
# work on rows with var2 different from 0 or NA
filter(var2 != 0) %>%
arrange(desc(var2), desc(var3)) %>%
# merge with rows with var2 equal to 0 or NA
bind_rows(df %>% filter(var2 == 0 | is.na(var2)) %>% arrange(var1)) %>%
arrange(desc(var2)) %>%
# create the rank column only for the rows with var2 different from NA
mutate(
rank = seq_len(nrow(df)),
rank = ifelse(is.na(var2), NA, rank)
)
Output
# var1 var2 var3 rank
# 1 z 556 68 1
# 2 c 345 73 2
# 3 f 345 45 3
# 4 b 200 34 4
# 5 a 200 12 5
# 6 ts 150 78 6
# 7 e 150 35 7
# 8 c 134 65 8
# 9 ad 25 34 9
# 10 gf 10 56 10
# 11 kg 0 56 11
# 12 mp 0 123 12
# 13 z 0 23 13
# 14 a NA NA NA
# 15 d NA NA NA
Using only base R's order() function, sort first on descending order of var2 and then on ascending order of var1, and pass the resulting integer vector to square brackets to sort the data:
df[order(-df$var2, df$var1), ]
Adding a rank column too is then just
# Number the rows 1..n in sorted order (var2 descending, then var1 ascending).
# seq_len() is used instead of 1:length(...) so a zero-row df yields an empty
# sequence rather than c(1, 0).
df[order(-df$var2, df$var1), "rank"] <- seq_len(nrow(df))
Using data.table
library(data.table)
setDT(df)[order(-var2, var1)][, rank := seq_len(.N)][]
data
df <- structure(list(var1 = structure(c(3L, 1L, 6L, 2L, 7L, 4L, 3L,
1L, 5L, 2L), .Label = c("a", "b", "c", "d", "e", "f", "z"), class = "factor"),
var2 = c(1456L, 456L, 345L, 134L, 0L, 0L, 0L, 0L, NA, NA)),
class = "data.frame", row.names = c(NA, -10L))
You can do it in base R, using order :
# Columns that take part in the sort; every other column is set aside.
cols <- c('var1', 'var2')
remaining_cols <- setdiff(names(df), cols)
df1 <- df[cols]
# Only the var1/var2 rows are reordered (var2 descending, then var1 ascending)
# and numbered 1..n. df[remaining_cols] is bound back in its ORIGINAL row
# order, so those columns are not carried along with the sorted rows.
cbind(transform(df1[with(df1, order(-var2, var1)), ],
rank = seq_len(nrow(df1))), df[remaining_cols])
# var1 var2 rank var3
#1 c 556 1 45
#2 a 345 2 35
#3 f 345 3 64
#4 b 134 4 87
#8 a 0 5 34
#7 c 0 6 32
#6 d 0 7 12
#5 z 0 8 23
#10 b NA 9 10
#9 e NA 10 11
data
df <- structure(list(var1 = structure(c(3L, 1L, 6L, 2L, 7L, 4L, 3L,
1L, 5L, 2L), .Label = c("a", "b", "c", "d", "e", "f", "z"), class = "factor"),
var2 = c(556L, 345L, 345L, 134L, 0L, 0L, 0L, 0L, NA, NA),
var3 = c(45L, 35L, 64L, 87L, 34L, 32L, 12L, 23L, 10L, 11L
)), class = "data.frame", row.names = c(NA, -10L))
I have a dataframe as follows:
df1
ColA ColB ColC ColD
10 A B L
11 N Q NA
12 P J L
43 M T NA
89 O J T
df2
ATTR Att R1 R2 R3 R4
1 45 A B NA NA
2 40 C D NA NA
3 33 T J O NA
4 65 L NA NA NA
5 20 P L J NA
6 23 Q NA NA NA
7 38 Q L NA NA
How do I match up df2 with df1 so that if ALL the values in each df2 row (disregarding the order) show up in the df1 rows, then it will populate. So it is checking if ALL not just one value from each df2 row matches up with each df1 row. The final result in this case should be this:
ColA ColB ColC ColD ATTR Att R1 R2 R3 R4
10 A B L 1 45 A B NA NA
10 A B L 4 65 L NA NA NA
11 N Q NA 6 23 Q NA NA NA
12 P J L 4 65 L NA NA NA
12 P J L 5 20 P L J NA
89 O J T 3 33 T J O NA
Thanks
Here is a possible solution using base R.
Make sure everything is a character before continuing, i.e.
df[-1] <- lapply(df[-1], as.character)
df1[-c(1:2)] <- lapply(df1[-c(1:2)], as.character)
First we create two lists which contain vectors of the rowwise (non-NA) elements of each data frame. We then create a matrix that holds, for every pair of rows, the number of elements of the l2 vector that are not found in the l1 vector. If that number is 0, all of the l2 elements are present, i.e. the rows match:
l1 <- lapply(split(df[-1], seq(nrow(df))), function(i) i[!is.na(i)])
l2 <- lapply(split(df1[-c(1:2)], seq(nrow(df1))), function(i) i[!is.na(i)])
m1 <- sapply(l1, function(i) sapply(l2, function(j) length(setdiff(j, i))))
m1
# 1 2 3 4 5
#1 0 2 2 2 2
#2 2 2 2 2 2
#3 3 3 2 2 0
#4 0 1 0 1 1
#5 2 3 0 3 2
#6 1 0 1 1 1
#7 1 1 1 2 2
We then use that matrix to create a couple of columns in our original df. The first column, rpt, counts for each row of df how many rows of df1 match it (i.e. how many zeros its column of the matrix contains), and we use that as the number of repeats for each row. We also use it to filter out the rows whose count is 0 (i.e. the rows that do not have a match with df1). After expanding the data frame we create another variable, ATTR (same name as ATTR in df1), in order to use it for a merge, i.e.
df$rpt <- colSums(m1 == 0)
df <- df[df$rpt != 0,]
df <- df[rep(row.names(df), df$rpt),]
df$ATTR <- which(m1 == 0, arr.ind = TRUE)[,1]
df
# ColA ColB ColC ColD rpt ATTR
#1 10 A B L 2 1
#1.1 10 A B L 2 4
#2 11 N Q <NA> 1 6
#3 12 P J L 2 4
#3.1 12 P J L 2 5
#5 89 O J T 1 3
We then merge and order the two data frames,
final_df <- merge(df, df1, by = 'ATTR')
final_df[order(final_df$ColA),]
# ATTR ColA ColB ColC ColD rpt Att R1 R2 R3 R4
#1 1 10 A B L 2 45 A B <NA> <NA>
#3 4 10 A B L 2 65 L <NA> <NA> <NA>
#6 6 11 N Q <NA> 1 23 Q <NA> <NA> <NA>
#4 4 12 P J L 2 65 L <NA> <NA> <NA>
#5 5 12 P J L 2 20 P L J <NA>
#2 3 89 O J T 1 33 T J O <NA>
DATA
dput(df)
structure(list(ColA = c(10L, 11L, 12L, 43L, 89L), ColB = c("A",
"N", "P", "M", "O"), ColC = c("B", "Q", "J", "T", "J"), ColD = c("L",
NA, "L", NA, "T")), .Names = c("ColA", "ColB", "ColC", "ColD"
), row.names = c(NA, -5L), class = "data.frame")
dput(df1)
structure(list(ATTR = 1:7, Att = c(45L, 40L, 33L, 65L, 20L, 23L,
38L), R1 = c("A", "C", "T", "L", "P", "Q", "Q"), R2 = c("B",
"D", "J", NA, "L", NA, "L"), R3 = c(NA, NA, "O", NA, "J", NA,
NA), R4 = c(NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_)), .Names = c("ATTR",
"Att", "R1", "R2", "R3", "R4"), row.names = c(NA, -7L), class = "data.frame")