string split in r at specific position [duplicate] - r

I have a dataframe where the date and time are mixed like this:
ID <- c(1,2,3,4)
DDMMYY <-c(100310,110310,120310,130310)
HHMM <- c(2205,1045,1110,2250)
df <- data.frame(ID,DDMMYY,HHMM)
df
ID DDMMYY HHMM
1 100310 2205
2 110310 1045
3 120310 1110
4 130310 2250
I want to split the date and time so that DD, MM, YY, HH and MM fall into separate columns like this:
ID DD MM YY HH MM
1 10 3 10 22 5
2 11 3 10 10 45
3 12 3 10 11 10
4 13 3 10 22 50
Any idea? Thanks

One option would be to use extract from tidyr
library(tidyr)
extract(extract(df, DDMMYY, c("DD","MM", "YY"), "(..)(..)(..)",
convert=TRUE), HHMM, c("HH", "MM"), "(..)(..)", convert=TRUE)
# ID DD MM YY HH MM
#1 1 10 3 10 22 5
#2 2 11 3 10 10 45
#3 3 12 3 10 11 10
#4 4 13 3 10 22 50
Or you could use strsplit from base R
df[,c("DD", "MM", "YY", "HH", "MM")] <- do.call(data.frame,lapply(df[,-1],
function(x) do.call(rbind,lapply(strsplit(as.character(x),
'(?<=..)(?=..)', perl=TRUE), as.numeric))))
df[,-(2:3)]
# ID DD MM YY HH MM.1
#1 1 10 3 10 22 5
#2 2 11 3 10 10 45
#3 3 12 3 10 11 10
#4 4 13 3 10 22 50

For fun, here are three more alternatives:
Convert your data to actual dates and use format
# Parse the two numeric columns as a single date-time, then pull each
# component back out as text with format().
within(df, {
# Glue date and time together so one strptime() call can parse both.
var <- paste(DDMMYY, HHMM)
var <- strptime(var, format = "%d%m%y %H%M")
# The components are created in the reverse of the desired column order;
# the result printed below comes out as DD, MM, YY, hh, mm.
mm <- format(var, "%M")
hh <- format(var, "%H")
YY <- format(var, "%y")
MM <- format(var, "%m")
DD <- format(var, "%d")
# Drop the helper variable and the original combined columns from the result.
rm(var, DDMMYY, HHMM)
})
# ID DD MM YY hh mm
# 1 1 10 03 10 22 05
# 2 2 11 03 10 10 45
# 3 3 12 03 10 11 10
# 4 4 13 03 10 22 50
Use read.fwf (and a bunch of other nested stuff)
# Keep the first (ID) column and bind the fixed-width pieces of every other
# column next to it.
cbind(df[1],
setNames(do.call(
cbind, lapply(df[-1], function(x) {
# Feed the values to read.fwf() as text and cut them into 2-character
# fields; the number of fields is derived from the first value's length.
read.fwf(textConnection(as.character(x)),
widths = rep(2, nchar(x[1])/2))
})),
# Names for the five columns produced from DDMMYY (3 fields) and HHMM (2).
c("DD", "MM", "YY", "hh", "mm")))
# ID DD MM YY hh mm
# 1 1 10 3 10 22 5
# 2 2 11 3 10 10 45
# 3 3 12 3 10 11 10
# 4 4 13 3 10 22 50
Use separate from "tidyr"
library(dplyr)
library(tidyr)
df %>%
separate(DDMMYY, into = c("DD", "MM", "YY"), sep = c(2, 4)) %>%
separate(HHMM, into = c("hh", "mm"), sep = 2)
# ID DD MM YY hh mm
# 1 1 10 03 10 22 05
# 2 2 11 03 10 10 45
# 3 3 12 03 10 11 10
# 4 4 13 03 10 22 50

Related

Having a subset of a data set based on a specific condition

I have a data set in which the values of "age" have different units (days, months, years). I want to convert the rows whose values are given in days or months to years. How can I do this in R?
If there is no letter after the number, then the unit is years.
If there is a ‘D’ after the number, then the unit is days (e.g. 10D means 10 days)
If there is an ‘M’ after the number, then the unit is months (e.g. 5M means 5 months).
Age <- c("33","32","44","54M","67M","34D","33D","44","77","88M","49 D","55D","11M")
ID <- c(1,2,3,4,5,6,7,8,9,10,11,12,13)
Data <- data.frame(ID,Age)
> Data
ID Age
1 1 33
2 2 32
3 3 44
4 4 54M
5 5 67M
6 6 34D
7 7 33D
8 8 44
9 9 77
10 10 88M
11 11 49 D
12 12 55D
13 13 11M
Here's a quick way in base R:
# Add three columns to Data: the unit of each Age entry, its numeric value,
# and the age expressed in years.
Data <- local({
  age_raw <- Data$Age

  # Default unit is years; a "D" marks days and an "M" marks months.
  # Months are applied last so that "M" wins if both letters are present.
  unit <- rep("year", length(age_raw))
  unit[grepl("D", age_raw)] <- "day"
  unit[grepl("M", age_raw)] <- "month"

  # Keep only the digits of each entry.
  amount <- as.numeric(gsub(pattern = "[^0-9]", replacement = "", age_raw))

  # Number of each unit in one year.
  per_year <- c(year = 1, month = 12, day = 365.25)

  Data$units <- unit
  Data$value <- amount
  Data$result <- unname(amount / per_year[unit])
  Data
})
Data
# ID Age units value result
# 1 1 33 year 33 33.00000000
# 2 2 32 year 32 32.00000000
# 3 3 44 year 44 44.00000000
# 4 4 54M month 54 4.50000000
# 5 5 67M month 67 5.58333333
# 6 6 34D day 34 0.09308693
# 7 7 33D day 33 0.09034908
# 8 8 44 year 44 44.00000000
# 9 9 77 year 77 77.00000000
# 10 10 88M month 88 7.33333333
# 11 11 49 D day 49 0.13415469
# 12 12 55D day 55 0.15058179
# 13 13 11M month 11 0.91666667
And here's another option using tidyverse tools:
library(dplyr)
library(stringr)
Data %>%
mutate(Unit = str_extract(string = Age,pattern = "[DM]"),
Unit = if_else(is.na(Unit),'Y',Unit),
Age = as.numeric(gsub(pattern = "[MD]","",Age))) %>%
mutate(AgeYears = Age / c('Y' = 1,'M' = 12,'D' = 365)[Unit])
ID Age Unit AgeYears
1 1 33 Y 33.00000000
2 2 32 Y 32.00000000
3 3 44 Y 44.00000000
4 4 54 M 4.50000000
5 5 67 M 5.58333333
6 6 34 D 0.09315068
7 7 33 D 0.09041096
8 8 44 Y 44.00000000
9 9 77 Y 77.00000000
10 10 88 M 7.33333333
11 11 49 D 0.13424658
12 12 55 D 0.15068493
13 13 11 M 0.91666667
#baseR
Age <- c("33", "32", "44", "54M", "67M", "34D", "33D", "44", "77", "88M",
         "49 D", "55D", "11M")
# Numeric part of every entry: strip optional whitespace plus the trailing
# unit letter (entries without a letter are left untouched).
AgeNum <- as.numeric(sub("\\s*\\D$", "", Age))
# How many of the entry's unit make up one year: 12 for months ("M"),
# 365 for days ("D"), 1 for plain years.
unitsPerYear <- ifelse(grepl("M$", Age), 12,
                       ifelse(grepl("D$", Age), 365, 1))
# Divide once, in numeric space. Writing the converted numbers back into the
# character vector and re-parsing them would round them to 15 significant
# digits on the way through.
Age <- AgeNum / unitsPerYear
result:
> Age
[1] 33.00000000 32.00000000 44.00000000 4.50000000 5.58333333 0.09315068 0.09041096 44.00000000
[9] 77.00000000 7.33333333 0.13424658 0.15068493 0.91666667
>
Additionally, a further solution using data.table:
> library(data.table)
> dt <- data.table(ID, Age)
> dt[, Unit := ifelse(grepl("D$", Age), "D", ifelse(grepl("M$", Age), "M", "Y"))][
, Age := as.integer(gsub("M|D", "", Age))]
> dt[, Age_in_years := ifelse(Unit == "Y", Age,
ifelse(Unit == "M", Age / 12, Age / 365.25))][]
ID Age Unit Age_in_years
1: 1 33 Y 33.00000000
2: 2 32 Y 32.00000000
3: 3 44 Y 44.00000000
4: 4 54 M 4.50000000
5: 5 67 M 5.58333333
6: 6 34 D 0.09308693
7: 7 33 D 0.09034908
8: 8 44 Y 44.00000000
9: 9 77 Y 77.00000000
10: 10 88 M 7.33333333
11: 11 49 D 0.13415469
12: 12 55 D 0.15058179
13: 13 11 M 0.91666667

Reshape/gather function to create dataset ready for multilevel analysis

I have a big dataset with 240 cases representing 240 patients. They have all undergone neuropsychological tests and filled in questionnaires. Additionally, their significant others (hereafter: proxies) have also filled in questionnaires. Since 'patient' and 'proxy' are nested in 'couples', I want to conduct a multilevel analysis in R. For this, I need to reshape my dataset so that I can run that kind of analysis.
Simply said, I want to 'duplicate' my rows. For the double subject IDs add a new variable with 1s and 2s, where 1 stands for patient data and 2 stands for proxy data. Then I want the rows to be filled with 1. all the patient data and the columns that contain the proxy data to be NA or empty or whatever, and 2. all the proxy data, and all the patient data NA or empty.
Let's say this is my data:
id <- c(1:5)
names <- c('id', 'p1', 'p2', 'p3', 'pr1', 'pr2', 'pr3')
p1 <- c(sample(1:10, 5))
p2 <- c(sample(10:20, 5))
p3 <- c(sample(20:30, 5))
pr1 <- c(sample(1:10, 5))
pr2 <- c(sample(10:20, 5))
pr3 <- c(sample(20:30, 5))
mydf <- as.data.frame(matrix(c(id, p1, p2, p3, pr1, pr2, pr3), nrow = 5))
colnames(mydf) <- names
>mydf
id p1 p2 p3 pr1 pr2 pr3
1 1 6 20 22 1 10 24
2 2 8 11 24 2 18 29
3 3 7 10 25 6 20 26
4 4 3 14 20 10 15 20
5 5 5 19 29 7 14 22
I want my data finally to look like this:
id2 <- rep(c(1:5), each = 2)
names2 <- c('id', 'couple', 'q1', 'q2', 'q3')
couple <- rep(1:2, 5)
p1 <- c(sample(1:10, 5))
p2 <- c(sample(10:20, 5))
p3 <- c(sample(20:30, 5))
pr1 <- c(sample(1:10, 5))
pr2 <- c(sample(10:20, 5))
pr3 <- c(sample(20:30, 5))
mydf <- as.data.frame(matrix(c(id2, couple, p1, p2, p3, pr1, pr2, pr3), nrow = 10, ncol = 5))
colnames(mydf) <- names2
>mydf
id couple q1 q2 q3
1 1 1 6 23 16
2 1 2 10 28 10
3 2 1 1 27 14
4 2 2 7 21 20
5 3 1 5 30 18
6 3 2 12 2 27
7 4 1 10 1 25
8 4 2 13 7 21
9 5 1 11 6 20
10 5 2 18 3 23
Or, if this is not possible, like this:
id couple bb1 bb2 bb3 pbb1 pbb2 pbb3
1 1 1 6 23 16
2 1 2 10 28 10
3 2 1 1 27 14
4 2 2 7 21 20
5 3 1 5 30 18
6 3 2 12 2 27
7 4 1 10 1 25
8 4 2 13 7 21
9 5 1 11 6 20
10 5 2 18 3 23
Now, to get there, I've tried the melt() function and the gather() function, and it feels like I'm close, but it's still not working the way I want it to.
note, in my dataset the variable names are bb1:bb54 for the patient questionnaire and pbb1:pbb54 for the proxy questionnaire
Example of what I've tried
df_long <- df_reshape %>%
gather(testname, value, -(bb1:bb11), -(pbb1:pbb11), -id, -pgebdat, -p_age, na.rm=T) %>%
arrange(id)
If I understand what you want correctly, you can gather everything to a very long form and then reshape back to a slightly wider form:
library(tidyverse)
set.seed(47) # for reproducibility
mydf <- data.frame(id = c(1:5),
p1 = c(sample(1:10, 5)),
p2 = c(sample(10:20, 5)),
p3 = c(sample(20:30, 5)),
pr1 = c(sample(1:10, 5)),
pr2 = c(sample(10:20, 5)),
pr3 = c(sample(20:30, 5)))
mydf_long <- mydf %>%
gather(var, val, -id) %>%
separate(var, c('couple', 'q'), -2) %>%
mutate(q = paste0('q', q)) %>%
spread(q, val)
mydf_long
#> id couple q1 q2 q3
#> 1 1 p 10 17 21
#> 2 1 pr 10 11 24
#> 3 2 p 4 13 27
#> 4 2 pr 4 15 20
#> 5 3 p 7 14 30
#> 6 3 pr 1 14 29
#> 7 4 p 6 18 24
#> 8 4 pr 8 20 30
#> 9 5 p 9 16 23
#> 10 5 pr 3 18 25
One approach would be to use unite and separate in tidyr, along with the gather function as well.
I'm using your mydf data frame since it was provided, but it should be pretty straightforward to make any changes:
mydf %>%
unite(p1:p3, col = `1`, sep = ";") %>% # Combine responses of 'p1' through 'p3'
unite(pr1:pr3, col = `2`, sep = ";") %>% # Combine responses of 'pr1' through 'pr3'
gather(couple, value, `1`:`2`) %>% # Form into long data
separate(value, sep = ";", into = c("q1", "q2", "q3"), convert = TRUE) %>% # Separate and retrieve original answers
arrange(id)
Which gives you:
id couple q1 q2 q3
1 1 1 9 18 25
2 1 2 10 18 30
3 2 1 1 11 29
4 2 2 2 15 29
5 3 1 10 19 26
6 3 2 3 19 25
7 4 1 7 10 23
8 4 2 1 20 28
9 5 1 6 16 21
10 5 2 5 12 26
Our numbers are different since they were all randomly generated with sample.
Edited per @alistaire's comment: added convert = TRUE to the separate call to make sure the responses are still of class integer.

selecting middle n rows in R

I have a data.table in R say df.
row.number <- c(1:20)
a <- c(rep("A", 10), rep("B", 10))
b <- c(sample(c(0:100), 20, replace = TRUE))
df <-data.table(row.number,a,b)
df
row.number a b
1 1 A 14
2 2 A 59
3 3 A 39
4 4 A 22
5 5 A 75
6 6 A 89
7 7 A 11
8 8 A 88
9 9 A 22
10 10 A 6
11 11 B 37
12 12 B 42
13 13 B 39
14 14 B 8
15 15 B 74
16 16 B 67
17 17 B 18
18 18 B 12
19 19 B 56
20 20 B 21
I want to take 'n' rows (say 10) from the middle after arranging the records in increasing order of column b.
Use setorder to sort and .N to filter:
# Sort by b, then keep the 10 central rows: skip floor((.N - 10) / 2) rows at
# the start so that (for an even remainder) the same number is dropped from
# each end, e.g. rows 6..15 of 20.
setorder(df, b)[(floor((.N - 10) / 2) + 1):(floor((.N - 10) / 2) + 10), ]
row.number a b
1: 11 B 36
2: 5 A 38
3: 8 A 41
4: 18 B 43
5: 1 A 50
6: 12 B 51
7: 15 B 54
8: 3 A 55
9: 20 B 59
10: 4 A 60
You could use the following code
library(data.table)
set.seed(9876) # for reproducibility
# your data
row.number <- 1:20
a <- c(rep("A", 10), rep("B", 10))
b <- sample(0:100, 20, replace = TRUE)
df <- data.table(row.number, a, b)
df
# define how many to select and store in n
n <- 10
# n must fit inside the table, otherwise there is no "middle n" to take
stopifnot(n >= 0, n <= nrow(df))
# calculate how many rows to cut off at the start; floor() keeps this a whole
# number when nrow(df) - n is odd (the extra row is then dropped at the end)
n_not <- floor((nrow(df) - n) / 2)
# use data.table's setorder to arrange based on column b (sorts df in place)
setorder(df, b)
# select the n rows that follow the first n_not rows
df[n_not + seq_len(n), ]
Please let me know whether this is what you want.

multiple line on same plot for timeseries

Below table has 366 days of data:
od
month dayofmonth total ad aont
1 1 1 27 9 18
2 1 2 31 24 7
3 1 3 30 25 5
4 1 4 29 15 14
5 1 5 27 1 26
6 1 6 30 18 12
7 1 7 31 8 23
8 1 8 30 9 21
9 1 9 25 23 2
10 1 10 31 15 16
11 1 11 27 17 7
12 1 12 27 3 24
13 1 13 26 11 15
14 1 14 28 12
library(zoo)
require(xts)
Dates <- seq(as.Date(f, "%Y - %m - %d"), as.Date(t, "%Y - %m - %d"), "day")
total<- xts(od$total, order.by = Dates)
dont<- xts(od$ad, order.by = Dates)
adont<- xts(od$aont, order.by = Dates)
I used the zoo package; now I want to draw multiple lines in the same plot.
Using plot.type="single" in the plot call can help with this.
#open libraries
library(xts)
library(zoo)
#set some random variables
a=rnorm(100)
b=rnorm(100)
#and some time series
c=seq(as.Date("2000/1/1"), by = "week", length.out = 100)
d=cbind(a,b)
#make it into an zoo object
d=as.zoo(d, order.by=c)
plot.zoo(d,
plot.type = "single",
col = c("red", "blue"))
Create the zoo object and then plot:
library(ggplot2)
library(zoo)
z <- zoo(od[3:5], as.Date(paste(2012, od$month, od$dayofmonth, sep = "-")))
autoplot(z, facet = NULL)

Combine two dataframes one above the other

I have two dataframes and I want to put one above the other "with" column names of second as a row of the new dataframe. Column names are different and one dataframe has more columns.
For example:
mydf1 <- data.frame(V1=c(1:5), V2=c(21:25))
mydf1
V1 V2
1 1 21
2 2 22
3 3 23
4 4 24
5 5 25
mydf2 <- data.frame(C1=c(1:10), C2=c(21:30),C3=c(41:50))
mydf2
C1 C2 C3
1 1 21 41
2 2 22 42
3 3 23 43
4 4 24 44
5 5 25 45
6 6 26 46
7 7 27 47
8 8 28 48
9 9 29 49
10 10 30 50
Result:
mydf
V1 V2
1 1 21 NA
2 2 22 NA
3 3 23 NA
4 4 24 NA
5 5 25 NA
6 C1 C2 C3
7 1 21 41
8 2 22 42
9 3 23 43
10 4 24 44
11 5 25 45
12 6 26 46
13 7 27 47
14 8 28 48
15 9 29 49
16 10 30 50
I don't care if all numeric values are treated as characters.
Many thanks
You can do this easily without any packages:
mydf1 <- data.frame(V1=c(1:5), V2=c(21:25))
mydf1[,3] <- NA
names(mydf1) <- c("one", "two", "three")
mydf2 <- data.frame(C1=c(1:10), C2=c(21:30),C3=c(41:50))
names <- t(as.data.frame(names(mydf2)))
names <- as.data.frame(names)
names(mydf2) <- c("one", "two", "three")
names(names) <- c("one", "two", "three")
mydf3 <- rbind(mydf1, names)
mydf4 <- rbind(mydf3, mydf2)
> mydf4
one two three
1 1 21 <NA>
2 2 22 <NA>
3 3 23 <NA>
4 4 24 <NA>
5 5 25 <NA>
6 C1 C2 C3
7 1 21 41
8 2 22 42
9 3 23 43
10 4 24 44
11 5 25 45
12 6 26 46
13 7 27 47
14 8 28 48
15 9 29 49
16 10 30 50
>
Of course, you can edit the <- c("one", "two", "three") to make the final column names whatever you'd like. For example:
> mydf1 <- data.frame(V1=c(1:5), V2=c(21:25))
> mydf1[,3] <- NA
> names(mydf1) <- c("V1", "V2", "NA")
> mydf2 <- data.frame(C1=c(1:10), C2=c(21:30),C3=c(41:50))
> names <- t(as.data.frame(names(mydf2)))
> names <- as.data.frame(names)
> names(mydf2) <- c("V1", "V2", "NA")
> names(names) <- c("V1", "V2", "NA")
> mydf3 <- rbind(mydf1, names)
> mydf4 <- rbind(mydf3, mydf2)
> row.names(mydf4) <- NULL
> mydf4
V1 V2 NA
1 1 21 <NA>
2 2 22 <NA>
3 3 23 <NA>
4 4 24 <NA>
5 5 25 <NA>
6 C1 C2 C3
7 1 21 41
8 2 22 42
9 3 23 43
10 4 24 44
11 5 25 45
12 6 26 46
13 7 27 47
14 8 28 48
15 9 29 49
16 10 30 50
If you need to resort to a package for any reason when scaling this up to your real use case, then try melt from reshape2 or the package plyr. However, use of a package shouldn't be necessary.
I don't know what you tried with write.table, but that seems to me like the way to go.
I would create a function something like this:
# Stack any number of data frames on top of each other, keeping each one's
# column names as an ordinary data row.
#
# Every input is appended, header included, to one temporary CSV file, which
# is then read back without a header. Inputs with fewer columns than the
# widest one are padded with blank fields.
#
# Arguments:
#   ...  one or more data frames (anything ncol() and write.table() accept).
# Returns:
#   A data.frame with columns New_1 .. New_k, where k is the largest column
#   count among the inputs. Columns are read with stringsAsFactors = FALSE.
myFun <- function(...) {
  L <- list(...)
  if (length(L) == 0L) {
    stop("myFun() needs at least one data frame", call. = FALSE)
  }

  temp <- tempfile()
  # Remove the scratch file when the function exits, even after an error.
  on.exit(unlink(temp), add = TRUE)

  maxCol <- max(vapply(L, ncol, 1L))

  for (x in L) {
    # write.table() warns whenever column names are written with
    # append = TRUE; that is exactly what is wanted here, so silence it.
    suppressWarnings(
      write.table(x, file = temp, row.names = FALSE,
                  sep = ",", append = TRUE)
    )
  }

  # Supplying col.names fixes the number of columns up front, so short rows
  # are filled instead of determining the width of the result.
  read.csv(temp, header = FALSE, fill = TRUE,
           col.names = paste0("New_", seq_len(maxCol)),
           stringsAsFactors = FALSE)
}
Usage would then simply be:
myFun(mydf1, mydf2)
# New_1 New_2 New_3
# 1 V1 V2
# 2 1 21
# 3 2 22
# 4 3 23
# 5 4 24
# 6 5 25
# 7 C1 C2 C3
# 8 1 21 41
# 9 2 22 42
# 10 3 23 43
# 11 4 24 44
# 12 5 25 45
# 13 6 26 46
# 14 7 27 47
# 15 8 28 48
# 16 9 29 49
# 17 10 30 50
The function is written such that you can specify more than two data.frames as input:
mydf3 <- data.frame(matrix(1:8, ncol = 4))
myFun(mydf1, mydf2, mydf3)
# New_1 New_2 New_3 New_4
# 1 V1 V2
# 2 1 21
# 3 2 22
# 4 3 23
# 5 4 24
# 6 5 25
# 7 C1 C2 C3
# 8 1 21 41
# 9 2 22 42
# 10 3 23 43
# 11 4 24 44
# 12 5 25 45
# 13 6 26 46
# 14 7 27 47
# 15 8 28 48
# 16 9 29 49
# 17 10 30 50
# 18 X1 X2 X3 X4
# 19 1 3 5 7
# 20 2 4 6 8
Here's one approach with the rbind.fill function (part of the plyr package).
library(plyr)
setNames(rbind.fill(setNames(mydf1, names(mydf2[seq(mydf1)])),
rbind(names(mydf2), mydf2)), names(mydf1))
V1 V2 NA
1 1 21 <NA>
2 2 22 <NA>
3 3 23 <NA>
4 4 24 <NA>
5 5 25 <NA>
6 C1 C2 C3
7 1 21 41
8 2 22 42
9 3 23 43
10 4 24 44
11 5 25 45
12 6 26 46
13 7 27 47
14 8 28 48
15 9 29 49
16 10 30 50
Give this a try.
Assign the column names from the second data set to a vector, and then replace the second set's names with the names from the first set. Then create a list where the middle element is the vector you assigned. Now when you call rbind, it should be fine since everything is in the right order.
d1$V3 <- NA
nm <- names(d2)
names(d2) <- names(d1)
dc <- do.call(rbind, list(d1,nm,d2))
rownames(dc) <- NULL
dc

Resources