I have the following list:
peter <- data.frame(year = 1:5, a = rnorm(5), b = rnorm(5))
john <- data.frame(year = 1:5, a = rnorm(5), b = rnorm(5))
myList <- list(peter, john)
names(myList) <- c("peter", "john")
myList
$peter
year a b
1 1.01464245 0.2490931
2 1.38054309 0.8396630
3 -0.84094830 0.2410526
4 -0.05567379 0.6369121
5 -0.66412862 1.5739672
$john
year a b
1 0.3060996 -0.4256702
2 0.7167710 -0.6828029
3 -0.6896138 0.6577422
4 -1.7647412 -0.5651756
5 0.3065734 -0.4860141
How can I transform myList into the following data frame:
year student a b
1 1 peter 1.01464245 0.2490931
2 2 peter 1.38054309 0.8396630
3 3 peter -0.84094830 0.2410526
4 4 peter -0.05567379 0.6369121
5 5 peter -0.66412862 1.5739672
6 1 john 0.30609964 -0.4256702
7 2 john 0.71677097 -0.6828029
8 3 john -0.68961377 0.6577422
9 4 john -1.76474117 -0.5651756
10 5 john 0.30657340 -0.4860141
Thank you very much.
library(plyr)
dat <- ldply(myList)
colnames(dat) <- c("student", "year", "a", "b")
print(dat)
## student year a b
## 1 peter 1 0.03716519 0.8465317
## 2 peter 2 -1.15449127 1.5461944
## 3 peter 3 0.15933780 0.7468312
## 4 peter 4 0.91745104 0.1113958
## 5 peter 5 -0.22924789 -0.5344617
## 6 john 1 0.40790134 0.5886599
## 7 john 2 -0.88635369 -0.3596063
## 8 john 3 -1.16444277 1.1080161
## 9 john 4 -0.19082412 0.1675609
## 10 john 5 1.19066829 -0.8855810
Another alternative (very similar to Ben's)
> # Bind the per-student data frames; rbind() names the rows "<student>.<i>".
> df <- do.call(rbind, myList)
> # Strip the whole trailing ".<i>" suffix (one or more digits, anchored at the
> # end) so rows 10 and above, and student names containing dots, are handled.
> df <- transform(df, student=sub("\\.[0-9]+$", "", rownames(df)))[, c("year", "student", "a", "b")]
> rownames(df)<- NULL
> df
year student a b
1 1 peter -0.71040656 -0.04502772
2 2 peter 0.25688371 -0.78490447
3 3 peter -0.24669188 -1.66794194
4 4 peter -0.34754260 -0.38022652
5 5 peter -0.95161857 0.91899661
6 1 john -0.57534696 0.30115336
7 2 john 0.60796432 0.10567619
8 3 john -1.61788271 -0.64070601
9 4 john -0.05556197 -0.84970435
10 5 john 0.51940720 -1.02412879
Similar to Ben's, only a bit different.
# Stack the per-student data frames, then label each row with the name of the
# list element it came from. The label is built from names(myList) and the row
# counts instead of being parsed out of the row names, so student names that
# contain a dot (e.g. "j.smith") are kept intact.
dd <- do.call(rbind, myList)
student <- rep(names(myList), vapply(myList, nrow, integer(1)))
cbind(dd[1], student = student, dd[2:3], row.names = NULL)
# year student a b
# 1 1 peter -1.66983899 0.3683629
# 2 2 peter 0.25391016 -0.4999335
# 3 3 peter -0.19102468 -0.9344484
# 4 4 peter 1.72821089 -2.6148841
# 5 5 peter 0.30320439 -0.2602509
# 6 1 john -0.02447092 -0.2396401
# 7 2 john -1.57022813 1.1159078
# 8 3 john 2.82545689 0.6818537
# 9 4 john -0.11273218 -1.8000738
# 10 5 john -1.39706920 0.1647720
Update Sept 16, 2015 An improvement on my previous answer:
# Insert a `student` column holding label `y` right after the first column
# of data frame `x`; all remaining columns follow unchanged.
f <- function(x, y) {
  leading <- x[1]
  trailing <- x[-1]
  cbind(leading, student = y, trailing)
}
do.call(rbind, Map(f, myList, names(myList), USE.NAMES = FALSE))
A bit unwieldy, but:
# Two example students, each with five yearly observations of a and b.
peter <- data.frame(year = 1:5, a = rnorm(5), b = rnorm(5))
john <- data.frame(year = 1:5, a = rnorm(5), b = rnorm(5))
myList <- list(peter = peter, john = john)

# Prefix each student's data frame with a column holding the student's name,
# then stack the labelled pieces into one data frame.
add_student <- function(student, d) data.frame(student, d)
labelled <- Map(add_student, names(myList), myList)
do.call(rbind, labelled)
define a function that creates a data frame from the student name and the student information;
pass it to mapply() (with SIMPLIFY=FALSE) to get a list of augmented data frames;
do.call(rbind,...) to combine the pieces.
Alternatively:
# Stack all students' rows, then build a matching label vector by repeating
# each list name once per row of its data frame.
info <- do.call(rbind, myList)
# vapply() guarantees an integer count per element (sapply() would return an
# empty list for an empty input, which rep() cannot use).
student <- rep(names(myList), vapply(myList, nrow, integer(1)))
data.frame(student, info)
Related
There should be a fairly simple solution to this but it's giving me trouble. I have a DF similar to this:
> df <- data.frame(name = c("george", "george", "george", "sara", "sara", "sam", "bill", "bill"),
id_num = c(1, 1, 2, 3, 3, 4, 5, 5))
> df
name id_num
1 george 1
2 george 1
3 george 2
4 sara 3
5 sara 3
6 sam 4
7 bill 5
8 bill 5
I'm looking for a way to find rows where the name and ID numbers are inconsistent in a very large dataset. I.e., George should always be "1" but in row three there is a mistake and he has also been assigned ID number "2".
I think the easiest way will be to use dplyr::count twice, hence for your example:
# Count rows per (name, id_num) pair, then count how many distinct id_num
# values each name ended up with; a result above 1 flags an inconsistent name.
# The ID column in df is called id_num (there is no column named id).
df %>%
  count(name, id_num) %>%
  count(name)
The first count will give:
name id_num n
george 1 2
george 2 1
sara 3 2
sam 4 1
bill 5 2
Then the second count will give:
name n
george 2
sara 1
sam 1
bill 1
Of course, you could add filter(n > 1) to the end of your pipe, too, or arrange(desc(n))
# Same two-step count, keeping only names linked to more than one id_num,
# listed from most to fewest distinct IDs.
# The ID column in df is called id_num (there is no column named id).
df %>%
  count(name, id_num) %>%
  count(name) %>%
  filter(n > 1) %>%
  arrange(desc(n))
Using tapply() to calculate number of ID's per name, then subset for greater than 1.
res <- with(df, tapply(id_num, list(name), \(x) length(unique(x))))
res[res > 1]
# george
# 2
You probably want to correct this. A safe way is to rebuild the numeric ID's using as.factor(),
df$id_new <- as.integer(as.factor(df$name))
df
# name id_num id_new
# 1 george 1 2
# 2 george 1 2
# 3 george 2 2
# 4 sara 3 4
# 5 sara 3 4
# 6 sam 4 3
# 7 bill 5 1
# 8 bill 5 1
where numbers are assigned according to the names in alphabetical order, or factor(), reading in the levels in order of appearance.
df$id_new2 <- as.integer(factor(df$name, levels=unique(df$name)))
df
# name id_num id_new id_new2
# 1 george 1 2 1
# 2 george 1 2 1
# 3 george 2 2 1
# 4 sara 3 4 2
# 5 sara 3 4 2
# 6 sam 4 3 3
# 7 bill 5 1 4
# 8 bill 5 1 4
Note: R >= 4.1 used.
Data:
df <- structure(list(name = c("george", "george", "george", "sara",
"sara", "sam", "bill", "bill"), id_num = c(1, 1, 2, 3, 3, 4,
5, 5)), class = "data.frame", row.names = c(NA, -8L))
I am having different files that have a variable that is named differently but has the same string character “type_category” e.g., type_category_lifestyle_characterstics, type_category_uniqueness etc. The idea is to go through these files and rename such variables to type_category. Below are examples of data frames
df1 <- data.frame(id = c(1,2,3), type_category_lifestyle_characterstics = c(5,6,7), rating = c(1,3,4))
df2 <- data.frame(id = c(9,5,3), type_category_uniqueness = c(4,6,1), rating = c(2,7,4))
Thanks in advance
We can get the datasets in a list
library(dplyr)
library(purrr)
out <- map(mget(ls(pattern = '^df\\d+$')), ~ .x %>%
rename_with(~ "type_category",
starts_with("type_category")))
-output
out
$df1
id type_category rating
1 1 5 1
2 2 6 3
3 3 7 4
$df2
id type_category rating
1 9 4 2
2 5 6 7
3 3 1 4
We could use setNames with lapply:
my_list <- list(df1, df2)
colnames <- c("id","type_category","rating")
lapply(my_list, setNames, colnames)
output:
[[1]]
id type_category rating
1 1 5 1
2 2 6 3
3 3 7 4
[[2]]
id type_category rating
1 9 4 2
2 5 6 7
3 3 1 4
Base R
Once you got them in a list, you can use lapply to change the variable names in all of them
df1 <- data.frame(id = c(1,2,3), type_category_lifestyle_characterstics = c(5,6,7), rating = c(1,3,4))
df2 <- data.frame(id = c(9,5,3), type_category_uniqueness = c(4,6,1), rating = c(2,7,4))
lapply(list(df1, df2),
function(df){
nms <- names(df)
nms[grepl(pattern = "type_category",
x = nms,
ignore.case = TRUE)] <- "type_category"
names(df) <- nms
return(df)
})
#> [[1]]
#> id type_category rating
#> 1 1 5 1
#> 2 2 6 3
#> 3 3 7 4
#>
#> [[2]]
#> id type_category rating
#> 1 9 4 2
#> 2 5 6 7
#> 3 3 1 4
Just note that you would need to assign the result back to a list.
data.table
Since you tagged data.table, this allows you to change the names in place and no extra assignment is necessary
library(data.table)
dt1 <- data.table::data.table(id = c(1,2,3), type_category_lifestyle_characterstics = c(5,6,7), rating = c(1,3,4))
dt2 <- data.table::data.table(id = c(9,5,3), type_category_uniqueness = c(4,6,1), rating = c(2,7,4))
# Rename, by reference, every column whose name contains "type_category"
# (case-insensitive) to exactly "type_category" in each data.table.
invisible(
  lapply(list(dt1, dt2),
         function(dt) {
           # Copy only the names vector; copying the whole table just to read
           # its column names is unnecessary work.
           nms_old <- data.table::copy(names(dt))
           nms_new <- nms_old
           nms_new[grepl(pattern = "type_category",
                         x = nms_old,
                         ignore.case = TRUE)] <- "type_category"
           # setnames() modifies dt in place; the lapply() result is discarded.
           data.table::setnames(dt, old = nms_old, new = nms_new)
           NULL
         })
)
dt1
#> id type_category rating
#> 1: 1 5 1
#> 2: 2 6 3
#> 3: 3 7 4
dt2
#> id type_category rating
#> 1: 9 4 2
#> 2: 5 6 7
#> 3: 3 1 4
I have data from the ACS Census that comes with metadata and tables with coded column names. I need to change the column names to the ones given in the metadata so the tables make sense when read. Is there a way to replace the coded column names with the actual ones by linking each table with the metadata?
I have used the dplyr package to rename the columns one by one, but I can't do that for every table, as there are around 32000 columns in a single table (data frame).
Any help would be appreciated, thanks.
We can use rename and evaluate a named vector
library(dplyr)
# Build a named character vector c(new = "old", ...) from the lookup table and
# splice it into rename(). Base setNames() is used because dplyr does not
# export set_names() (that helper lives in rlang/purrr).
df2 <- rename(df2, !!!setNames(df1$original, df1$new))
df2
# A B C
#1 1 6 11
#2 2 7 12
#3 3 8 13
#4 4 9 14
#5 5 10 15
data
df1 <-data.frame(original = c('a', 'b', 'c'), new = c('A', 'B', 'C'),
stringsAsFactors = FALSE)
df2 <- data.frame(a = 1:5, b =6:10, c = 11:15)
If quasiquotation is inconvenient, you can overwrite the existing names with a vector. Start with a trivial example - the band_instruments in dplyr
library(dplyr)
foo <- band_instruments
foo
# A tibble: 3 x 2
name plays
<chr> <chr>
1 John guitar
2 Paul bass
3 Keith guitar
Overwrite with a character vector
names(foo) <- c("Moniker", "Jams.On")
R > foo
# A tibble: 3 x 2
Moniker Jams.On
<chr> <chr>
1 John guitar
2 Paul bass
3 Keith guitar
So if you have a translation table, even one which has some extra info in it:
foo <- band_instruments
# Translation table; it may contain entries for columns that foo does not have.
trTbl <- tibble(Names.Now = c("plays", "name", "shoes"),
                Names.Desired = c("Jams.On", "Moniker", "boots"))
# Look up the desired name for each current column of foo, in column order.
# Columns missing from the translation table keep their current name instead
# of being renamed to NA.
replaceVec <- tibble(Names.Now = names(foo)) %>%
  left_join(trTbl, by = "Names.Now") %>%
  mutate(Names.Desired = coalesce(Names.Desired, Names.Now)) %>%
  pull(Names.Desired)
names(foo) <- replaceVec
foo
# A tibble: 3 x 2
Moniker Jams.On
<chr> <chr>
1 John guitar
2 Paul bass
3 Keith guitar
You could use the good ol' match approach.
dat ## before
# X1 X4 X3 X2
# 1 1 4 7 10
# 2 2 5 8 11
# 3 3 6 9 12
# Look up each current column name in the code table; names with no matching
# code keep their original name instead of becoming NA.
idx <- match(names(dat), am$code)
names(dat) <- ifelse(is.na(idx), names(dat), am$label[idx])
dat ## after
# wage hh.size age no.children
# 1 1 4 7 10
# 2 2 5 8 11
# 3 3 6 9 12
Data:
dat <- structure(list(X1=1:3, X2=4:6, X3=7:9, X4=10:12), class="data.frame",
row.names=c(NA, -3L))
am <- structure(list(code=c("X1", "X2", "X3", "X4"),
label=c("age", "wage", "no.children", "hh.size")),
class="data.frame", row.names=c(NA, -4L))
I have a df like this
name <- c("Fred","Mark","Jen","Simon","Ed")
a_or_b <- c("a","a","b","a","b")
abc_ah_one <- c(3,5,2,4,7)
abc_bh_one <- c(5,4,1,9,8)
abc_ah_two <- c(2,1,3,7,6)
abc_bh_two <- c(3,6,8,8,5)
abc_ah_three <- c(5,4,7,6,2)
abc_bh_three <- c(9,7,2,1,4)
def_ah_one <- c(1,3,9,2,7)
def_bh_one <- c(2,8,4,6,1)
def_ah_two <- c(4,7,3,2,5)
def_bh_two <- c(5,2,9,8,3)
def_ah_three <- c(8,5,3,5,2)
def_bh_three <- c(2,7,4,3,0)
df <- data.frame(name,a_or_b,abc_ah_one,abc_bh_one,abc_ah_two,abc_bh_two,
abc_ah_three,abc_bh_three,def_ah_one,def_bh_one,
def_ah_two,def_bh_two,def_ah_three,def_bh_three)
I want to use the value in column "a_or_b" to choose the values in each of the corresponding "ah/bh" columns for each "abc" (one, two, and three), and put it into a new data frame. For example, Fred would have the values 3, 2 and 5 in his row in the new df. Those values represent the values of each of his "ah" categories for the abc columns. Jen, who has "b" in her a_or_b column, would have all of her "bh" values from her abc columns for her row in the new df. Here is what my desired output would look like:
combo_one <- c(3,5,1,4,8)
combo_two <- c(2,1,8,7,5)
combo_three <- c(5,4,2,6,4)
df2 <- data.frame(name,a_or_b,combo_one,combo_two,combo_three)
I've attempted this using sapply. The following gives me a matrix of the correct column indexes into df[grep("abc",colnames(df),fixed=TRUE)] for each row:
sapply(paste0(df$a_or_b,"h"),grep,colnames(df[grep("abc",colnames(df),fixed=TRUE)]))
First we gather your data into a tidy long format, then break the column names out into something useful. After that the filtering is simple, and if necessary we can convert back to a (harder to work with) wide format:
library(dplyr)
library(tidyr)
# Reshape to long form (one row per person and measurement column), split
# names such as "abc_ah_one" into combo / h / ind, reduce h to its first
# letter so it is comparable with a_or_b, and keep only the "abc" rows whose
# h matches the person's a_or_b.
result_long <- df %>%
  gather(key = "var", value = "val", -name, -a_or_b) %>%
  separate(col = var, into = c("combo", "h", "ind"), sep = "_") %>%
  mutate(h = substr(h, start = 1, stop = 1)) %>%
  filter(combo == "abc", a_or_b == h) %>%
  arrange(name)
result_long
# name a_or_b combo h ind val
# 1 Ed b abc b one 8
# 2 Ed b abc b two 5
# 3 Ed b abc b three 4
# 4 Fred a abc a one 3
# 5 Fred a abc a two 2
# 6 Fred a abc a three 5
# 7 Jen b abc b one 1
# 8 Jen b abc b two 8
# 9 Jen b abc b three 2
# 10 Mark a abc a one 5
# 11 Mark a abc a two 1
# 12 Mark a abc a three 4
# 13 Simon a abc a one 4
# 14 Simon a abc a two 7
# 15 Simon a abc a three 6
spread(result_long, key = ind, value = val) %>%
select(name, a_or_b, one, two, three)
# name a_or_b one two three
# 1 Ed b 8 5 4
# 2 Fred a 3 2 5
# 3 Jen b 1 8 2
# 4 Mark a 5 1 4
# 5 Simon a 4 7 6
Base R approach would be using lapply, we loop through each row of the dataframe, create a string to find similar columns using paste0 based on a_or_b column and then rbind all the values together for each row.
# For every row, pick the three "abc_<a|b>h_*" columns selected by that row's
# a_or_b value and give them common names so the rows can be stacked.
# seq_len() keeps the iteration empty for a zero-row df (seq(0) yields 1, 0).
new_df <- do.call("rbind", lapply(seq_len(nrow(df)), function(i) {
  # The prefix is matched as fixed text, e.g. "abc_ah" or "abc_bh".
  prefix <- paste0("abc_", df[i, "a_or_b"], "h")
  picked <- df[i, grepl(prefix, colnames(df), fixed = TRUE)]
  setNames(picked, c("combo_one", "combo_two", "combo_three"))
}))
new_df
# combo_one combo_two combo_three
#1 3 2 5
#2 5 1 4
#3 1 8 2
#4 4 7 6
#5 8 5 4
We can cbind the required columns then :
cbind(df[c(1, 2)], new_df)
# name a_or_b combo_one combo_two combo_three
#1 Fred a 3 2 5
#2 Mark a 5 1 4
#3 Jen b 1 8 2
#4 Simon a 4 7 6
#5 Ed b 8 5 4
It's possible to do this with a combination of map and mutate:
library(tidyverse)
# For each suffix, choose the "ah" or "bh" abc column row by row according to
# a_or_b. The values are computed from df itself rather than unquoted with !!
# from objects named combo_one / combo_two / combo_three in the calling
# environment, so the result does not depend on those objects already existing.
suffixes <- c("one", "two", "three")
combos <- map(
  set_names(suffixes, paste0("combo_", suffixes)),
  ~ if_else(df$a_or_b == "a",
            df[[paste0("abc_ah_", .x)]],
            df[[paste0("abc_bh_", .x)]])
)
# Splice the named list in as the new combo_* columns.
df %>%
  select(name, a_or_b) %>%
  mutate(!!!combos)
Output:
name a_or_b combo_one combo_two combo_three
1 Fred a 3 2 5
2 Mark a 5 1 4
3 Jen b 1 8 2
4 Simon a 4 7 6
5 Ed b 8 5 4
I have this example: df.Journal.Conferences
venue author0 author1 author2 ... author19
A John Mary
B Peter Jacob Isabella
C Lia
B Jacob Lara John
C Mary
B Isabella
I want to know how many unique authors are in each venue
Result:
A 2
B 5
C 2
Edit:
Here is the link to my data: GoogleDrive Excel sheet.
because your data was hard to reproduce, I generated a "similar" data set,
this should work:
# Simulated stand-in for the real data: an id column plus four value columns.
# The sample() calls stay in their original order so the random stream (and
# therefore the generated data) is unchanged.
set.seed(1984)
df <- data.frame(id = sample(1:5, 10, replace = TRUE),
                 v1 = sample(letters[1:5], 10, replace = TRUE),
                 v2 = sample(letters[1:5], 10, replace = TRUE),
                 v3 = sample(letters[1:5], 10, replace = TRUE),
                 v4 = sample(letters[1:5], 10, replace = TRUE),
                 stringsAsFactors = FALSE)
# One row per id (in order of first appearance); n is the number of distinct
# values found across all value columns of the rows sharing that id.
z <- data.frame(id = unique(df$id))
z$n <- vapply(z$id,
              function(i) length(unique(unlist(df[df$id == i, -1]))),
              integer(1))
z
# id n
# 1 4 4
# 2 3 4
# 3 2 4
# 4 5 4
# 5 1 3
Using @zx8754's data for testing, this code gives what you wanted (assuming you have NA for empty cells in the data frame):
# Split the author columns by venue and count the distinct non-missing names
# in each group; vapply() guarantees an integer vector named by venue.
vapply(split(df1[,-1], df1$venue),
       function(x) length(unique(x[!is.na(x)])),
       integer(1))
# A B C
# 2 5 2
Using dplyr and tidyr, reshape the data from wide to long, then group by venue and count the distinct author names.
library(dplyr)
library(tidyr)
gather(df1, key = author, value = name, -venue) %>%
select(venue, name) %>%
group_by(venue) %>%
summarise(n = n_distinct(name, na.rm = TRUE))
# # A tibble: 3 × 2
# venue n
# <chr> <int>
# 1 A 2
# 2 B 5
# 3 C 2
data
df1 <- read.table(text ="
venue,author0,author1,author2
A,John,Mary,NA
B,Peter,Jacob,Isabella
C,Lia,NA,NA
B,Jacob,Lara,John
C,Mary,NA,NA
B,Isabella,NA,NA
", header = TRUE, sep = ",", stringsAsFactors = FALSE)
Edit: Saved your Excel sheet as CSV, then read in using read.csv, then above code returns below output:
df1 <- read.csv("Journal_Conferences_Authors.csv", na.strings = "#N/A")
# output
# # A tibble: 427 × 2
# venue n
# <fctr> <int>
# 1 AAAI 4
# 2 ACC 4
# 3 ACIS-ICIS 5
# 4 ACM SIGSOFT Software Engineering Notes 1
# 5 ACM Southeast Regional Conference 5
# 6 ACM TIST 3
# 7 ACM Trans. Comput.-Hum. Interact. 3
# 8 ACML 2
# 9 ADMA 2
# 10 Advanced Visual Interfaces 3
# # ... with 417 more rows