Add ID column to a list of data frames - r

I have a list of 142 data frames, file_content, and a list of IDs created with id_list <- list(as.character(1:length(file_content)))
I am trying to add a new column period to each data frame in file_content.
All data frames are similar to 2021-03-16 below.
`2021-03-16` <- file_content[[1]] # take a look at 1/142 dataframes in file_content
head(`2021-03-16`)
author_id created_at id tweet
1 3.304380e+09 2018-12-01 22:58:55+00:00 1.069003e+18 #Acosta I hope he didn’t really say “muck”.
2 5.291559e+08 2018-12-01 22:57:31+00:00 1.069003e+18 #Acosta I like Mattis, but why does he only speak this way when Individual-1 isn't around?
3 2.195313e+09 2018-12-01 22:56:41+00:00 1.069002e+18 #Acosta What did Mattis say about the informal conversation between Trump and Putin at the G20?
4 3.704188e+07 2018-12-01 22:56:41+00:00 1.069002e+18 #Acosta Good! Tree huggers be damned!
5 1.068995e+18 2018-12-01 22:56:11+00:00 1.069002e+18 #Acosta #NinerMBA_01
6 9.983321e+17 2018-12-01 22:55:13+00:00 1.069002e+18 #Acosta Really?
I have tried to add the period column using the following code but it adds all 142 values from the id_list to every row in every data frame in file_content.
for (id in length(id_list)) {
file_content <- lapply(file_content, function(x) { x$period <- paste(id_list[id], sep = "_"); x })
}

You were close, the mistake is you need double brackets in id_list[[id]].
for (id in length(id_list)) {
file_content <- lapply(file_content, function(x) {
x$period <- paste(id_list[[id]], sep = "_")
x
})
}
# $`1`
# X1 X2 X3 X4 period
# 1 1 4 7 10 1
# 2 2 5 8 11 2
# 3 3 6 9 12 3
#
# $`2`
# X1 X2 X3 X4 period
# 1 1 4 7 10 1
# 2 2 5 8 11 2
# 3 3 6 9 12 3
#
# $`3`
# X1 X2 X3 X4 period
# 1 1 4 7 10 1
# 2 2 5 8 11 2
# 3 3 6 9 12 3
You could also try Map() and save a few lines.
Map(`[<-`, file_content, 'period', value=id_list)
# $`1`
# X1 X2 X3 X4 period
# 1 1 4 7 10 1
# 2 2 5 8 11 2
# 3 3 6 9 12 3
#
# $`2`
# X1 X2 X3 X4 period
# 1 1 4 7 10 1
# 2 2 5 8 11 2
# 3 3 6 9 12 3
#
# $`3`
# X1 X2 X3 X4 period
# 1 1 4 7 10 1
# 2 2 5 8 11 2
# 3 3 6 9 12 3
Data:
# Reproducible sample data: three identical 3x4 data frames named "1".."3".
# simplify = FALSE keeps the result a list of data frames (spelled out because
# the shorthand F is an ordinary variable that can be reassigned).
file_content <- replicate(3, data.frame(matrix(1:12, 3, 4)), simplify = FALSE) |>
  setNames(1:3)
# Mirrors the question's id_list: a list holding ONE character vector of ids.
# seq_along() is used instead of 1:length() so an empty list yields no ids.
id_list <- list(as.character(seq_along(file_content)))

We may use imap
library(purrr)
library(dplyr)
imap(file_content, ~ .x %>%
mutate(period = .y))
Or with Map from base R
Map(cbind, file_content, period = names(file_content))
In the OP's code, the id_list is created as a single list element by wrapping with list i.e.
list(1:5)
vs
as.list(1:5)
Here, we don't need to convert to list as a vector is enough
id_list <- seq_along(file_content)
Also, the for loop iterates only once, because length(id_list) is a single number (the last index) rather than a sequence of indices
for (id in length(id_list)) {
^^
Instead, it should loop over 1:length(id_list) or, more safely, seq_along(id_list). In addition, the assignment should be on the single list element file_content[[id]] and not on the entire list
for(id in seq_along(id_list)) {
file_content[[id]]$period <- id_list[id]
}

Related

Rename multiple columns with series index using dplyr in R

My data frame looks like this
X0 <- c(11,2,3,4)
X1 <- c(10,2,3,4)
X2 <- c(8,2,3,4)
X3 <- c(4,6,3,4)
test <- data.frame(X0,X1,X2,X3)
X0 X1 X2 X3
1 11 10 8 4
2 2 2 2 6
3 3 3 3 3
4 4 4 4 4
I would like to rename the first three columns using the character "t" followed by the series 0:2.
I want my data frame to look like this
t0 t1 t2 X3
1 11 10 8 4
2 2 2 2 6
3 3 3 3 3
4 4 4 4 4
EDIT
It works like this
test %>%
rename_at(vars(X0:X2), list(~paste0("t", 0:2)))
Or using rename_with
library(dplyr)
library(stringr)
test %>%
rename_with(~ str_c('t', 0:2), X0:X2)
Here is a data.table option with setnames
setnames(setDT(test),1:3,function(v) gsub("X","t",v))

R - Subset dataframe to include only subjects with more than 1 record

I'd like to subset a dataframe to include all records for subjects that have >1 record, and exclude those subjects with only 1 record.
Let's take the following dataframe;
mydata <- data.frame(subject_id = factor(c(1,2,3,4,4,5,5,6,6,7,8,9,9,9,10)),
variable = rnorm(15))
The code below gives me the subjects with >1 record using duplicated();
duplicates <- mydata[duplicated(mydata$subject_id),]$subject_id
But I want to retain in my subset all records for each subject with >1 record, so I tried;
mydata[mydata$subject_id==as.factor(duplicates),]
Which does not return the result I'm expecting.
Any ideas?
A data.table solution
set.seed(20)
subject_id <- as.factor(c(1,2,3,4,4,5,5,6,6,7,8,9,9,9,10))
variable <- rnorm(15)
mydata<-as.data.frame(cbind(subject_id, variable))
library(data.table)
setDT(mydata)[, .SD[.N > 1], by = subject_id] # #Thanks David.
# subject_id variable
# 1: 4 -1.3325937
# 2: 4 -0.4465668
# 3: 5 0.5696061
# 4: 5 -2.8897176
# 5: 6 -0.8690183
# 6: 6 -0.4617027
# 7: 9 -0.1503822
# 8: 9 -0.6281268
# 9: 9 1.3232209
A simple alternative is to use dplyr:
library(dplyr)
dfr <- data.frame(a=sample(1:2,10,rep=T), b=sample(1:5,10, rep=T))
dfr <- group_by(dfr, b)
dfr
# Source: local data frame [10 x 2]
# Groups: b
#
# a b
# 1 2 4
# 2 2 2
# 3 2 5
# 4 2 1
# 5 1 2
# 6 1 3
# 7 2 1
# 8 2 4
# 9 1 4
# 10 2 4
filter(dfr, n() > 1)
# Source: local data frame [8 x 2]
# Groups: b
#
# a b
# 1 2 4
# 2 2 2
# 3 2 1
# 4 1 2
# 5 2 1
# 6 2 4
# 7 1 4
# 8 2 4
Here you go (I changed your variable to var <- rnorm(15)):
set.seed(11)
subject_id<-as.factor(c(1,2,3,4,4,5,5,6,6,7,8,9,9,9,10))
var<-rnorm(15)
mydata<-as.data.frame(cbind(subject_id,var))
x1 <- c(names(table(mydata$subject_id)[table(mydata$subject_id) > 1]))
x2 <- which(mydata$subject_id %in% x1)
mydata[x2,]
subject_id var
4 4 0.3951076
5 4 -2.4129058
6 5 -1.3309979
7 5 -1.7354382
8 6 0.4020871
9 6 0.4628287
12 9 -2.1744466
13 9 0.4857337
14 9 1.0245632
Try:
> mydata[mydata$subject_id %in% mydata[duplicated(mydata$subject_id),]$subject_id,]
subject_id variable
4 4 -1.3325937
5 4 -0.4465668
6 5 0.5696061
7 5 -2.8897176
8 6 -0.8690183
9 6 -0.4617027
12 9 -0.1503822
13 9 -0.6281268
14 9 1.3232209
I had to edit your data frame a little bit:
set.seed(20)
subject_id <- as.factor(c(1,2,3,4,4,5,5,6,6,7,8,9,9,9,10))
variable <- rnorm(15)
mydata<-as.data.frame(cbind(subject_id, variable))
Now to get all the rows for subjects that appear more than once:
mydata[duplicated(mydata$subject_id)
| duplicated(mydata$subject_id, fromLast = TRUE), ]
# subject_id variable
# 4 4 -1.3325937
# 5 4 -0.4465668
# 6 5 0.5696061
# 7 5 -2.8897176
# 8 6 -0.8690183
# 9 6 -0.4617027
# 12 9 -0.1503822
# 13 9 -0.6281268
# 14 9 1.3232209
Edit: this would also work, using your duplicates vector:
mydata[mydata$subject_id %in% duplicates, ]

Parsing Delimited Data In a DataFrame Into Separate Columns in R

I have a data frame which looks as such
A B C
1 3 X1=7;X2=8;X3=9
2 4 X1=10;X2=11;X3=12
5 6 X1=13;X2=14
I would like to parse the C column into separate columns as such...
A B X1 X2 X3
1 3 7 8 9
2 4 10 11 12
5 6 13 14 NA
How would one go about doing this in R?
First, here's the sample data in data.frame form
# Sample data: column C packs a varying number of "name=value" pairs
# separated by ";".
dd <- data.frame(
  A = c(1L, 2L, 5L),
  B = c(3L, 4L, 6L),
  C = c("X1=7;X2=8;X3=9",
        "X1=10;X2=11;X3=12", "X1=13;X2=14"),
  # Keep C as character so strsplit() can be applied to it. Spelled out as
  # FALSE because the shorthand F is an ordinary variable that can be reassigned.
  stringsAsFactors = FALSE
)
Now I define a small helper function to take vectors like c("A=1","B=2") and change them into named vectors like c(A="1", B="2").
# Convert "key=value" strings into a named character vector, e.g.
# c("A=1", "B=2") becomes c(A = "1", B = "2").
#
# x: character vector whose elements have the form "name=value".
# Returns a character vector of the values, named by the keys. An element
# without a value part yields NA as its value.
namev <- function(x) {
  parts <- strsplit(x, "=")
  # vapply (rather than sapply) guarantees a character vector even for
  # empty input, where sapply would return an empty list instead.
  values <- vapply(parts, `[`, character(1), 2)
  keys <- vapply(parts, `[`, character(1), 1)
  setNames(values, keys)
}
and now I perform the transformations
#turn each row into a named vector
vv<-lapply(strsplit(dd$C,";"), namev)
#find list of all column names
nm<-unique(unlist(sapply(vv, names)))
#extract data from all rows for every column
nv<-do.call(rbind, lapply(vv, '[', nm))
#convert everything to numeric (optional)
class(nv)<-"numeric"
#rejoin with original data
cbind(dd[,-3], nv)
and that gives you
A B X1 X2 X3
1 1 3 7 8 9
2 2 4 10 11 12
3 5 6 13 14 NA
My cSplit function makes solving problems like these fun. Here it is in action:
## Load some packages
library(data.table)
library(devtools) ## Just for source_gist, really
library(reshape2)
## Load `cSplit`
source_gist("https://gist.github.com/mrdwab/11380733")
First, split your values up and create a "long" dataset:
ddL <- cSplit(cSplit(dd, "C", ";", "long"), "C", "=")
ddL
# A B C_1 C_2
# 1: 1 3 X1 7
# 2: 1 3 X2 8
# 3: 1 3 X3 9
# 4: 2 4 X1 10
# 5: 2 4 X2 11
# 6: 2 4 X3 12
# 7: 5 6 X1 13
# 8: 5 6 X2 14
Next, use dcast.data.table (or just dcast) to go from "long" to "wide":
dcast.data.table(ddL, A + B ~ C_1, value.var="C_2")
# A B X1 X2 X3
# 1: 1 3 7 8 9
# 2: 2 4 10 11 12
# 3: 5 6 13 14 NA
Here's one possible approach:
dat <- read.table(text="A B C
1 3 X1=7;X2=8;X3=9
2 4 X1=10;X2=11;X3=12
5 6 X1=13;X2=14", header=TRUE, stringsAsFactors = FALSE)
library(qdapTools)
dat_C <- strsplit(dat$C, ";")
dat_C2 <- sapply(dat_C, function(x) {
y <- strsplit(x, "=")
rep(sapply(y, "[", 1), as.numeric(sapply(y, "[", 2)))
})
data.frame(dat[, -3], mtabulate(dat_C2))
## A B X1 X2 X3
## 1 1 3 7 8 9
## 2 2 4 10 11 12
## 3 5 6 13 14 0
EDIT To obtain the NA values
m <- mtabulate(dat_C2)
m[m==0] <- NA
data.frame(dat[, -3], m)
Here's a nice, somewhat hacky way to get you there.
## read your data
> dat <- read.table(h=T, text = "A B C
1 3 X1=7;X2=8;X3=9
2 4 X1=10;X2=11;X3=12
5 6 X1=13;X2=14", stringsAsFactors = FALSE)
## ---
> s <- strsplit(dat$C, ";|=")
> xx <- unique(unlist(s)[grepl('[A-Z]', unlist(s))])
> sap <- t(sapply(seq(s), function(i){
wh <- which(!xx %in% s[[i]]); n <- suppressWarnings(as.numeric(s[[i]]))
nn <- n[!is.na(n)]; if(length(wh)){ append(nn, NA, wh-1) } else { nn }
})) ## see below for explanation
> data.frame(dat[1:2], sap)
# A B X1 X2 X3
# 1 1 3 7 8 9
# 2 2 4 10 11 12
# 3 5 6 13 14 NA
Basically what's happening in sap is
check which values are missing
change each list element of s to numeric
remove the NA values from (2)
insert NA into the correct position with append
transpose the result

Keep columns of a data frame based on a data frame

I have a data frame, called df, which contains 4000 values. I have a list of 1000 column numbers, in a data frame called list, which is 1000 rows by 1 column. How can I keep the rows with the numbers in list in the data frame df and throw the rest out? I already tried using:
listv <- as.vector(list)
and then using
dfnew <- df[,listv]
but I get the error
Error in .subset(x, j) : invalid subscript type 'list'
You're mixing up rows and columns subsetting. Here is a minimal example:
df <- data.frame(matrix(1:21, ncol = 3))
df
# X1 X2 X3
# 1 1 8 15
# 2 2 9 16
# 3 3 10 17
# 4 4 11 18
# 5 5 12 19
# 6 6 13 20
# 7 7 14 21
list <- data.frame(V1 = c(1, 4, 6))
list
# V1
# 1 1
# 2 4
# 3 6
df[list[, 1], ]
# X1 X2 X3
# 1 1 8 15
# 4 4 11 18
# 6 6 13 20
df[unlist(list), ]
# X1 X2 X3
# 1 1 8 15
# 4 4 11 18
# 6 6 13 20
Note also that as.vector(list) doesn't create a vector, as you thought it would. You need unlist here (as I used in the last example).

Performing calculations on binned counts in R

I have a dataset stored in a text file in the format of bins of values followed by counts, like this:
var_a 1:5 5:12 7:9 9:14 ...
indicating that var_a took on the value 1 5 times in the dataset, 5 12 times, etc. Each variable is on its own line in that format.
I'd like to be able to perform calculations on this dataset in R, like quantiles, variance, and so on. Is there an easy way to load the data from the file and calculate these statistics? Ultimately I'd like to make a box-and-whisker plot for each variable.
Cheers!
You could use readLines to read in the data file
.x <- readLines(datafile)
I will create some dummy data, as I don't have the file. This should be the equivalent of the output of readLines
## dummy
.x <- c("var_a 1:5 5:12 7:9 9:14", 'var_b 1:5 2:12 3:9 4:14')
I split each line on spaces to separate the variable name from its value:count pairs
#split by space
space_split <- strsplit(.x, ' ')
# get the variable names (first in each list)
variable_names <- lapply(space_split,'[[',1)
# get the variable contents (everything but the first element in each list)
variable_contents <- lapply(space_split,'[',-1)
# Expand one (value, count) pair into `count` copies of `value`.
# x: numeric vector whose first element is the value and whose second
# element is the number of times to repeat it.
do_rep <- function(x) {
  value <- x[1]
  times <- x[2]
  rep.int(value, times)
}
# recreate the variables
variables <- lapply(variable_contents, function(x){
.list <- strsplit(x, ':')
unlist(lapply(lapply(.list, as.numeric), do_rep))
})
names(variables) <- variable_names
you could get the variance for each variable using
lapply(variables, var)
## $var_a
## [1] 6.848718
##
## $var_b
## [1] 1.138462
or get boxplots
boxplot(variables, ~.)
Not knowing the actual form that your data is in, I would probably use something like readLines to get each line in as a vector, then do something like the following:
# Some sample data: one line per variable, "name value:count value:count ..."
temp <- c("var_a 1:5 5:12 7:9 9:14",
          "var_b 1:7 4:9 3:11 2:10",
          "var_c 2:5 5:14 6:6 3:14")
# Extract the names: everything before the first space, so that names
# which themselves contain digits (e.g. "var_1") are kept intact
NAMES <- sub(" .*$", "", temp)
# Extract the data: split on spaces and colons and drop the leading name,
# leaving an alternating value, count, value, count, ... numeric vector
temp_1 <- strsplit(temp, " |:")
temp_1 <- lapply(temp_1, function(x) as.numeric(x[-1]))
# "Expand" the data: repeat each value (odd positions) by its count
# (even positions); a line with no bins expands to an empty vector
temp_1 <- lapply(temp_1, function(x) {
  is_value <- seq_along(x) %% 2 == 1
  rep(x[is_value], x[!is_value])
})
names(temp_1) <- NAMES
temp_1
temp_1
# $var_a
# [1] 1 1 1 1 1 5 5 5 5 5 5 5 5 5 5 5 5 7 7 7 7 7 7 7 7 7 9 9 9 9 9 9 9 9 9 9 9 9 9 9
#
# $var_b
# [1] 1 1 1 1 1 1 1 4 4 4 4 4 4 4 4 4 3 3 3 3 3 3 3 3 3 3 3 2 2 2 2 2 2 2 2 2 2
#
# $var_c
# [1] 2 2 2 2 2 5 5 5 5 5 5 5 5 5 5 5 5 5 5 6 6 6 6 6 6 3 3 3 3 3 3 3 3 3 3 3 3 3 3

Resources