Rearrange data by matching columns

Rearrange data by matching columns - r

I am having issue with rearranging some data.
The original data is:
structure(list(id = 1:3, artery.1 = structure(c(1L, 1L, 2L), .Label = c("a",
"b"), class = "factor"), artery.2 = structure(c(1L, NA, 2L), .Label = c("b",
"c"), class = "factor"), artery.3 = structure(c(1L, NA, 2L), .Label = c("c",
"d"), class = "factor"), artery.4 = structure(c(NA, NA, 1L), .Label = "e", class = "factor"), artery.5 = structure(c(NA, NA, 1L), .Label = "f", class = "factor"),
diameter.1 = c(3L, 2L, 1L), diameter.2 = c(2L, NA, 2L), diameter.3 = c(3L,
NA, 3L), diameter.4 = c(NA, NA, 4L), diameter.5 = c(NA, NA,
5L)), .Names = c("id", "artery.1", "artery.2", "artery.3",
"artery.4", "artery.5", "diameter.1", "diameter.2", "diameter.3",
"diameter.4", "diameter.5"), class = "data.frame", row.names = c(NA,
-3L))
# id artery.1 artery.2 artery.3 artery.4 artery.5 diameter.1 diameter.2 diameter.3 diameter.4 diameter.5
# 1 1 a b c <NA> <NA> 3 2 3 NA NA
# 2 2 a <NA> <NA> <NA> <NA> 2 NA NA NA NA
# 3 3 b c d e f 1 2 3 4 5
I would like to get to this:
structure(list(id = 1:3, a = c(3L, 2L, NA), b = c(2L, NA, 1L),
c = c(3L, NA, 2L), d = c(NA, NA, 3L), e = c(NA, NA, 4L),
f = c(NA, NA, 5L)), .Names = c("id", "a", "b", "c", "d",
"e", "f"), class = "data.frame", row.names = c(NA, -3L))
# id a b c d e f
# 1 1 3 2 3 NA NA NA
# 2 2 2 NA NA NA NA NA
# 3 3 NA 1 2 3 4 5
Basically, a to f represents arteries and the numerical values represent the corresponding diameter. Each row represents a patient.
Is there a neat way to sort this dataframe out?

Modern tidyr makes the solution even more succinct via the pivot_ functions:
library(dplyr)
library(tidyr)
df %>%
pivot_longer(-id, names_pattern = '(artery|diameter)\\.(\\d+)', names_to = c('.value', NA)) %>%
filter(!is.na(artery)) %>%
pivot_wider(names_from = artery, values_from = diameter)
id a b c d e f
<int> <int> <int> <int> <int> <int> <int>
1 1 3 2 3 NA NA NA
2 2 2 NA NA NA NA NA
3 3 NA 1 2 3 4 5
Here is the older solution, which uses the deprecated gather and spread functions:
library(dplyr)
library(tidyr)
new.df <- gather(df, variable, value, artery.1:diameter.5) %>%
separate(variable, c('variable', 'num')) %>%
spread(variable, value) %>%
subset(!is.na(artery)) %>%
mutate(diameter = as.numeric(diameter)) %>%
select(-num) %>%
spread(artery, diameter)
Output:
id a b c d e f
1 1 3 2 3 NA NA NA
2 2 2 NA NA NA NA NA
3 3 NA 1 2 3 4 5

Or using melt/dcast combination with data.table while selecting variables using regex in the patterns function
library(data.table) #v>=1.9.6
dcast(melt(setDT(df),
id = "id",
measure = patterns("artery", "diameter")),
id ~ value1,
sum,
value.var = "value2",
subset = .(!is.na(value2)),
fill = NA)
# id a b c d e f
# 1: 1 3 2 3 NA NA NA
# 2: 2 2 NA NA NA NA NA
# 3: 3 NA 1 2 3 4 5
As you can see, both melt and dcast are very flexible and you can use regex, specify a subset, pass multiple functions and specify how you want to fill missing values.

You can use xtabs with reshape from base R. Use the latter to transform data to long format and use the former to get the count table:
xtabs(diameter ~ id + artery, reshape(df, varying = 2:11, sep = '.', dir = "long"))
# artery
#id a b c d e f
# 1 3 2 3 0 0 0
# 2 2 0 0 0 0 0
# 3 0 1 2 3 4 5

This can be done with two reshape() calls. First, we can longify both artery and diameter on id, then widen with artery as the time variable. To prevent a column of NAs, we also must subset out rows with NA values for artery in the intermediate frame.
reshape(subset(reshape(df,dir='l',varying=setdiff(names(df),'id'),timevar=NULL),!is.na(artery)),dir='w',timevar='artery');
## id diameter.a diameter.b diameter.c diameter.d diameter.e diameter.f
## 1.1 1 3 2 3 NA NA NA
## 2.1 2 2 NA NA NA NA NA
## 3.1 3 NA 1 2 3 4 5
The diameter. prefixes can be removed afterward, if desired. However, an advantage of this solution is that it would be capable of preserving multiple column sets, whereas the xtabs() solution cannot. The prefixes would be essential to distinguish the column sets in that case.

Related

Join similar observations within a data.frame with R

I want to mix several observations in a data.frame using as a reference one constantly repeated variable.
Example:
id var1 var2 var3
a 1 na na
a na 2 na
a na na 3
b 1 na
b na 2 na
b na na na
c na na 3
c na 2 na
c 1 na na
Expected result:
id var1 var2 var3
a 1 2 3
b 1 2 na
c 1 2 3

A possible solution (replacing "na" by NA with na_if):
library(tidyverse)
df %>%
na_if("na") %>%
group_by(id) %>%
summarize(across(var1:var3, ~ sort(.x)[1]))
#> # A tibble: 3 × 4
#> id var1 var2 var3
#> <chr> <chr> <chr> <chr>
#> 1 a 1 2 3
#> 2 b 1 2 <NA>
#> 3 c 1 2 3

Assumptions:
"na" above is really R's native NA (not a string);
b's first row, var2 should be NA instead of an empty string ""
perhaps from the above, var1:var3 should be numbers
either you will never have a group where there is more than one non-NA in a group/column, or you don't care about anything other than the first and want the remaining discarded
library(dplyr)
dat %>%
group_by(id) %>%
summarize(across(everything(), ~ na.omit(.)[1]))
# # A tibble: 3 x 4
# id var1 var2 var3
# <chr> <int> <int> <int>
# 1 a 1 2 3
# 2 b 1 2 NA
# 3 c 1 2 3
Data
dat <- structure(list(id = c("a", "a", "a", "b", "b", "b", "c", "c", "c"), var1 = c(1L, NA, NA, 1L, NA, NA, NA, NA, 1L), var2 = c(NA, 2L, NA, NA, 2L, NA, NA, 2L, NA), var3 = c(NA, NA, 3L, NA, NA, NA, 3L, NA, NA)), class = "data.frame", row.names = c(NA, -9L))

Assuming that your data has NA, you can use the following base R option using the Data from #r2evans (thanks!):
aggregate(.~id, dat, mean, na.rm = TRUE, na.action=NULL)
Output:
id var1 var2 var3
1 a 1 2 3
2 b 1 2 NaN
3 c 1 2 3

Aggregating rows across multiple values

I have a large dataframe with approximately this pattern:
Person
Rate
Street
a
b
c
d
e
f
A
2
XYZ
1
NULL
3
4
5
NULL
A
2
XYZ
NULL
2
NULL
NULL
NULL
NULL
A
3
XYZ
NULL
NULL
NULL
NULL
NULL
6
B
2
DEF
NULL
NULL
NULL
NULL
5
NULL
B
2
DEF
NULL
2
3
NULL
NULL
6
C
1
DEF
1
2
3
4
5
6
A, b, c, d, e, f represents about 600 columns.
I am trying to combine the columns so that each person becomes one line, rows a-f combine into a single line using sum, and any conflicting rate or street information becomes a new row. So the data should look something like this:
Person
Rate
Rate 2
Street
a
b
c
d
e
f
A
2
3
XYZ
1
2
3
4
5
6
B
2
DEF
NULL
2
3
NULL
5
6
C
1
DEF
1
2
3
4
5
6
I keep trying to make this work with aggregate and summarize but I'm not sure that's the right approach.
Thank you very much for your help!

First we pivot all the unique rates per person and street.
library(reshape2)
tmp1=dcast(unique(df[,c("Person","Rate","Street")]),Person+Street~Rate,value.var="Rate")
colnames(tmp1)[-c(1:2)]=paste("Rate",colnames(tmp1)[-c(1:2)])
Then we aggregate and sum by person and rate, columns 4 to 9, from "a" to "f", change accordingly.
tmp2=aggregate(df[,4:9],list(Person=df$Person,Street=df$Street),function(x){
ifelse(all(is.na(x)),NA,sum(x,na.rm=T))
})
And finally merge the two.
merge(tmp1,tmp2,by=c("Person","Street"))
Person Street Rate 1 Rate 2 Rate 3 a b c d e f
1 A XYZ NA 2 3 1 2 3 4 5 6
2 B DEF NA 2 NA NA 2 3 NA 5 6
3 C DEF 1 NA NA 1 2 3 4 5 6

Perhaps, you can do this in two-step process -
library(dplyr)
library(tidyr)
#sum columns a-f
table1 <- df %>%
group_by(Person) %>%
summarise(across(a:f, sum, na.rm = TRUE))
#Remove duplicated values and get the data in separate columns
#for Rate and Street columns.
table2 <- df %>%
group_by(Person) %>%
mutate(across(c(Rate, Street), ~replace(., duplicated(.), NA))) %>%
select(Person, Rate, Street) %>%
filter(if_any(c(Rate, Street), ~!is.na(.))) %>%
mutate(col = row_number()) %>%
ungroup %>%
pivot_wider(names_from = col, values_from = c(Rate, Street)) %>%
select(where(~any(!is.na(.))))
#Join the two data to get final result
inner_join(table1, table2, by = 'Person')
# Person a b c d e f Rate_1 Rate_2 Street_1
# <chr> <int> <int> <int> <int> <int> <int> <int> <int> <chr>
#1 A 1 2 3 4 5 6 2 3 XYZ
#2 B 0 2 3 0 5 6 2 NA DEF
#3 C 1 2 3 4 5 6 1 NA DEF
data
It is helpful and easier to help when you share data in a reproducible format which can be copied directly. I have used the below data for the answer.
df <- structure(list(Person = c("A", "A", "A", "B", "B", "C"), Rate = c(2L,
2L, 3L, 2L, 2L, 1L), Street = c("XYZ", "XYZ", "XYZ", "DEF", "DEF",
"DEF"), a = c(1L, NA, NA, NA, NA, 1L), b = c(NA, 2L, NA, NA,
2L, 2L), c = c(3L, NA, NA, NA, 3L, 3L), d = c(4L, NA, NA, NA,
NA, 4L), e = c(5L, NA, NA, 5L, NA, 5L), f = c(NA, NA, 6L, NA,
6L, 6L)), row.names = c(NA, -6L), class = "data.frame")

Sum many rows with some of them have NA in all needed columns

I am trying to do rowSums but I got zero for the last row and I need it to be "NA".
My df is
a b c sum
1 1 4 7 12
2 2 NA 8 10
3 3 5 NA 8
4 NA NA NA NA
I used this code based on this link; Sum of two Columns of Data Frame with NA Values
df$sum<-rowSums(df[,c("a", "b", "c")], na.rm=T)
Any advice will be greatly appreciated

For each row check if it is all NA and if so return NA; otherwise, apply sum. We have selected columns a, b and c even though that is all the columns because the poster indicated that there might be additional ones.
sum_or_na <- function(x) if (all(is.na(x))) NA else sum(x, na.rm = TRUE)
transform(df, sum = apply(df[c("a", "b", "c")], 1, sum_or_na))
giving:
a b c sum
1 1 4 7 12
2 2 NA 8 10
3 3 5 NA 8
4 NA NA NA NA
Note
df in reproducible form is assumed to be:
df <- structure(list(a = c(1L, 2L, 3L, NA), b = c(4L, NA, 5L, NA),
c = c(7L, 8L, NA, NA)),
row.names = c("1", "2", "3", "4"), class = "data.frame")

Summation of the corresponding number of values which are in different columns

My data frame looks like below:
df<-data.frame(alphabets1=c("A","B","C","B","C"," ","NA"),alphabets2=c("B","A","D","D"," ","E","NA"),alphabets3=c("C","F","G"," "," "," ","NA"), number = c("1","2","3","1","4","1","2"))
alphabets1 alphabets2 alphabets3 number
1 A B C 1
2 B A F 2
3 C D G 3
4 B D 1
5 C 4
6 E 1
7 NA NA NA 2
NOTE1: within the row all the values are unique, that is, below shown is not possible.
alphabets1 alphabets2 alphabets3 number
1 A A C 1
NOTE2: data frame may contains NA or is blank
I am struggling to get the below output: which is nothing but a dataframe which has the alphabets and the sum of their corresponding numbers, that is A alphabet is in 1st and 2nd rows so its sum of its corresponding number is 1+2 i.e 3 and let's say B, its in 1st, 2nd and 4th row so the sum will be 1+2+1 i.e 4.
output <-data.frame(alphabets1=c("A","B","C","D","E","F","G"), number = c("3","4","8","4","1","2","3"))
output
alphabets number
1 A 3
2 B 4
3 C 8
4 D 4
5 E 1
6 F 2
7 G 3
NOTE3: output may or may not have the NA or blanks (it doesn't matter!)

We can reshape it to 'long' format and do a group by operation
library(data.table)
melt(setDT(df), id.var="number", na.rm = TRUE, value.name = "alphabets1")[
!grepl("^\\s*$", alphabets1), .(number = sum(as.integer(as.character(number)))),
alphabets1]
# alphabets1 number
#1: A 3
#2: B 4
#3: C 8
#4: D 4
#5: E 1
#6: F 2
#7: G 3
Or we can use xtabs from base R
xtabs(number~alphabets1, data.frame(alphabets1 = unlist(df[-4]),
number = as.numeric(as.character(df[,4]))))
NOTE: In the OP's dataset, the missing values were "NA", and not real NA and the 'number' column is factor (which was changed by converting to integer for doing the sum)
data
df <- data.frame(alphabets1=c("A","B","C","B","C"," ",NA),
alphabets2=c("B","A","D","D"," ","E",NA),
alphabets3=c("C","F","G"," "," "," ",NA),
number = c("1","2","3","1","4","1","2"))

Here is a base R method using sapply and table. I first converted df$number into a numeric. See data section below.
data.frame(table(sapply(df[-length(df)], function(i) rep(i, df$number))))
Var1 Freq
1 11
2 A 3
3 B 4
4 C 8
5 D 4
6 E 1
7 F 2
8 G 3
9 NA 6
To make the output a little bit nicer, we could wrap a few more functions and perform a subsetting within sapply.
data.frame(table(droplevels(unlist(sapply(df[-length(df)],
function(i) rep(i[i %in% LETTERS],
df$number[i %in% LETTERS])),
use.names=FALSE))))
Var1 Freq
1 A 3
2 B 4
3 C 8
4 D 4
5 E 1
6 F 2
7 G 3
It may be easier to do this afterward, though.
data
I ran
df$number <- as.numeric(df$number)
on the OP's data resulting in this.
df <-
structure(list(alphabets1 = structure(c(2L, 3L, 4L, 3L, 4L, 1L,
5L), .Label = c(" ", "A", "B", "C", "NA"), class = "factor"),
alphabets2 = structure(c(3L, 2L, 4L, 4L, 1L, 5L, 6L), .Label = c(" ",
"A", "B", "D", "E", "NA"), class = "factor"), alphabets3 = structure(c(2L,
3L, 4L, 1L, 1L, 1L, 5L), .Label = c(" ", "C", "F", "G", "NA"
), class = "factor"), number = c(1, 2, 3, 1, 4, 1, 2)), .Names = c("alphabets1",
"alphabets2", "alphabets3", "number"), row.names = c(NA, -7L), class = "data.frame")

Restructuring data using apply family of functions

I have inherited a data set that is 23 attributes measured for each of 13 names (between-subjects--each participant only rated one name on all of these attributes). Right now it's structured such that the attributes are the fastest-moving factor, followed by the name. So the the data look like this:
Sub# N1-item1 N1-item2 N1-item3 […] N2-item1 N2-item2 N2-item3
1 3 5 3 NA NA NA
2 NA NA NA 1 5 3
3 3 5 3 NA NA NA
4 NA NA NA 2 2 1
It needs to be restructured it such that it's collapsed over name, and all of the item1 entries are the same column (subjects don't matter for this purpose), as below (bearing in mind that there are 23 items not 3 and 13 names not 2):
Name item1 item2 item3
N1 3 5 3
N2 1 5 3
I can do this with loops and, but I'd rather do it in a manner more natural to R, which I'm guessing would be one of the apply family of functions, but I can't quite wrap my head around it--what is the smart way to do this?

Here's an answer using dplyr and tidyr:
library(dplyr)#loads libraries
library(tidyr)
dat %>% #name of your dataframe
gather(key, val, -Sub) %>% #gathers to long data, with id as Sub
filter(!is.na(val)) %>% #removes rows with NA for the value
separate(key, c("Name", "item")) %>% #split the column key into Name and item
spread(item, val) #spreads the data into wide format, with item as the columns
Sub Name item1 item2 item3
1 1 N1 3 5 3
2 2 N2 1 5 3
3 3 N1 3 5 3
4 4 N2 2 2 1

Spin the column names around to be itemX-NY and then let reshape sort it out:
names(dat)[-1] <- gsub("(^.+?)-(.+?$)", "\\2-\\1", names(dat)[-1])
na.omit(reshape(dat, direction="long", idvar="Sub", varying=-1, sep="-"))
# Sub time item1 item2 item3
#1.N1 1 N1 3 5 3
#3.N1 3 N1 3 5 3
#2.N2 2 N2 1 5 3
#4.N2 4 N2 2 2 1
Where the data was:
dat <- structure(list(Sub = 1:4, `item1-N1` = c(3L, NA, 3L, NA), `item2-N1` = c(5L,
NA, 5L, NA), `item3-N1` = c(3L, NA, 3L, NA), `item1-N2` = c(NA,
1L, NA, 2L), `item2-N2` = c(NA, 5L, NA, 2L), `item3-N2` = c(NA,
3L, NA, 1L)), .Names = c("Sub", "item1-N1", "item2-N1", "item3-N1",
"item1-N2", "item2-N2", "item3-N2"), row.names = c(NA, -4L), class = "data.frame

Develop Reference

r css asp.net wordpress firebase qt symfony nginx http apache-flex

Rearrange data by matching columns - r

You can use xtabs with reshape from base R. Use the latter to transform data to long format and use the former to get the count table: xtabs(diameter ~ id + artery, reshape(df, varying = 2:11, sep = '.', dir = "long")) # artery #id a b c d e f # 1 3 2 3 0 0 0 # 2 2 0 0 0 0 0 # 3 0 1 2 3 4 5

Related

Join similar observations within a data.frame with R

Aggregating rows across multiple values

Sum many rows with some of them have NA in all needed columns

Summation of the corresponding number of values which are in different columns

Restructuring data using apply family of functions

Categories

Resources