Transpose every n columns into new rows in R - r

I have a data frame that looks like this
Frame RightEye_x RightEye_y RightEye_z LeftEye_x LeftEye_y LeftEye_z
0 773 490 0 778 322 0
1 780 490 0 789 334 0
2 781 490 0 792 334 0
3 783 337 0 797 334 1
And I would like to transform it into
BodyPart Frame x y z
RightEye 0 773 490 0
RightEye 1 780 490 0
RightEye 2 781 490 0
RightEye 3 783 337 0
LeftEye 0 778 322 0
LeftEye 1 789 334 0
LeftEye 2 792 334 0
LeftEye 3 797 334 1

Using the melt(...) method in data.table:
library(data.table)
setDT(df)
result <- melt(df, measure.vars = patterns(c('_x', '_y', '_z')), value.name = c('x', 'y', 'z'))
result[, variable:=c('RightEye', 'LeftEye')[variable]]
result
## Frame variable x y z
## 1: 0 RightEye 773 490 0
## 2: 1 RightEye 780 490 0
## 3: 2 RightEye 781 490 0
## 4: 3 RightEye 783 337 0
## 5: 0 LeftEye 778 322 0
## 6: 1 LeftEye 789 334 0
## 7: 2 LeftEye 792 334 0
## 8: 3 LeftEye 797 334 1

We can use base R reshape like below
reshape(
setNames(df, gsub("(.*)_(.*)", "\\2_\\1", names(df))),
direction = "long",
idvar = "Frame",
varying = -1,
timevar = "BodyPart",
sep = "_"
)
which gives
Frame BodyPart x y z
0.RightEye 0 RightEye 773 490 0
1.RightEye 1 RightEye 780 490 0
2.RightEye 2 RightEye 781 490 0
3.RightEye 3 RightEye 783 337 0
0.LeftEye 0 LeftEye 778 322 0
1.LeftEye 1 LeftEye 789 334 0
2.LeftEye 2 LeftEye 792 334 0
3.LeftEye 3 LeftEye 797 334 1

Related

Subsetting nested lists based on condition (values) in R

I have a large nested list (list of named lists) - the example of such a list is given below. I would like to create a new list, in which only sub-lists with "co" vectors containing both 0 and 1 values would be preserved, while 0-only sublists would be discarded (eg. the output should contain only first-, third- and fourth- subgroups.
I played with lapply and filter according to this thread:
Subset elements in a list based on a logical condition
However, it throwed errors. I would appreciate tips how to handle lists within the lists.
# reprex
set.seed(123)
## empty lists
first_group <- list()
second_group <- list()
third_group <- list()
fourth_group <- list()
# dummy_vecs
values1 <- c(sample(120:730, 30, replace=TRUE))
coeff1 <- c(sample(0:1, 30, replace=TRUE))
values2 <- c(sample(50:810, 43, replace=TRUE))
coeff2 <- c(rep(0, 43))
values3 <- c(sample(510:730, 57, replace=TRUE))
coeff3 <- c(rep(0, 8), rep(1, 4), rep(0, 45))
values4 <- c(sample(123:770, 28, replace=TRUE))
coeff4 <- c(sample(0:1, 28, replace=TRUE))
## fill lists with values:
first_group[["val"]] <- values1
first_group[["co"]] <- coeff1
second_group[["val"]] <- values2
second_group[["co"]] <- coeff2
third_group[["val"]] <- values3
third_group[["co"]] <- coeff3
fourth_group[["val"]] <- values4
fourth_group[["co"]] <- coeff4
#concatenate lists:
dummy_list <- list()
dummy_list[["first-group"]] <- first_group
dummy_list[["second-group"]] <- second_group
dummy_list[["third-group"]] <- third_group
dummy_list[["fourth-group"]] <- fourth_group
rm(values1, values2, values3, values4, coeff1, coeff2, coeff3, coeff4, first_group, second_group, third_group, fourth_group)
gc()
#show list
print(dummy_list)
# create boolean for where condition is TRUE
cond <- sapply(dummy_list, function(x) any(0 %in% x$co) & any(1 %in% x$co))
# subset
dummy_list[cond]
You could use Filter from base R:
Filter(function(x) sum(x$co) !=0, dummy_list)
Or you can use purrr:
library(tidyverse)
dummy_list %>%
keep( ~ sum(.$co) != 0)
Output
$`first-group`
$`first-group`$val
[1] 534 582 298 645 314 237 418 348 363 133 493 721 722 210 467 474 145 638 545 330 709 712 674 492 262 663 609 142 428 254
$`first-group`$co
[1] 0 0 1 1 0 1 0 0 1 1 0 0 1 0 0 0 0 1 0 0 0 0 1 1 0 1 1 1 1 0
$`third-group`
$`third-group`$val
[1] 713 721 683 526 699 555 563 672 619 603 588 533 622 724 616 644 730 716 660 663 611 669 644 664 679 514 579 525 533 541 530 564 584 673 592 726 548 563 727
[40] 646 708 557 586 592 693 620 548 705 510 677 539 603 726 525 597 563 712
$`third-group`$co
[1] 0 0 0 0 0 0 0 0 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
$`fourth-group`
$`fourth-group`$val
[1] 142 317 286 174 656 299 676 206 645 755 514 424 719 741 711 552 550 372 551 520 650 503 667 162 644 595 322 247
$`fourth-group`$co
[1] 0 0 0 0 1 0 1 1 1 0 1 0 1 0 0 0 0 0 0 1 0 1 0 0 0 0 1 1
However, if you also want to exclude any co that have all 1s, then we can add an extra condition.
Filter(function(x) sum(x$co) !=0 & sum(x$co == 0) > 0, dummy_list)
purrr
dummy_list %>%
keep( ~ sum(.$co) != 0 & sum(.$co == 0) > 0)

Transform single row into rows and columns

I have a list of 170 items, each with 12 variables. This data is currently organised in one continuous row (1 observations of 2040 variables), e.g.:
0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2
but I want it to be organised into 170 columns with 12 rows as follows:
0 1 2
0 1 2
0 1 2
0 1 2
0 1 2
0 1 2
0 1 2
0 1 2
0 1 2
0 1 2
0 1 2
0 1 2
I have tried the following:
list2=lapply(list1, function(x) t(x))
but this doesn't alter the data in any way. Is there something else I can do to transform the data?
We convert the string to a vector of numeric elements with scan, split the vector by itself to create a list and convert it to a data.frame
v2 <- scan(text=v1, what=numeric(), quiet=TRUE)
data.frame(split(v2, v2))
If your data is already converted into a vector (as #akrun showed with using scan) you could also do:
data <- 1:2040 # your data
breaks <- seq(1, 2040, 170)
result <- lapply(breaks, function(x) data[x : (x + 169)])
Results in
> str(result)
List of 12
$ : int [1:170] 1 2 3 4 5 6 7 8 9 10 ...
$ : int [1:170] 171 172 173 174 175 176 177 178 179 180 ...
$ : int [1:170] 341 342 343 344 345 346 347 348 349 350 ...
$ : int [1:170] 511 512 513 514 515 516 517 518 519 520 ...
$ : int [1:170] 681 682 683 684 685 686 687 688 689 690 ...
$ : int [1:170] 851 852 853 854 855 856 857 858 859 860 ...
$ : int [1:170] 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 ...
$ : int [1:170] 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 ...
$ : int [1:170] 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 ...
$ : int [1:170] 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 ...
$ : int [1:170] 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 ...
$ : int [1:170] 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 ...

Join on two separate columns in a data.frame with a simple code

I need a simple code to join on two separate columns from the same key data frame.
My first data.frame looks like this:
> head(data)
bikeid end.station.id start.station.id diff.time stoptime starttime
4 15941 259 315 564 2014-09-02 17:59:30 2014-09-02 18:08:54
8 15941 229 450 2616 2014-09-09 09:28:39 2014-09-09 10:12:15
9 15941 477 465 3223 2014-09-09 15:59:23 2014-09-09 16:53:06
10 15941 319 147 570 2014-09-09 18:55:44 2014-09-09 19:05:14
14 15941 3002 304 3208 2014-09-12 14:54:10 2014-09-12 15:47:38
19 15941 469 267 2514 2014-09-25 16:13:24 2014-09-25 16:55:18
midtime
4 2014-09-02 18:04:12
8 2014-09-09 09:50:27
9 2014-09-09 16:26:14
10 2014-09-09 19:00:29
14 2014-09-12 15:20:54
19 2014-09-25 16:34:21
I need to join end.station.id and start.station.id on citibike_station_id in the data.frame stations
> head(stations)
id citibike_station_id latitude longitude label
1 1 72 40.76727 -73.99393 W 52 St & 11 Ave
2 2 79 40.71912 -74.00667 Franklin St & W Broadway
3 3 82 40.71117 -74.00017 St James Pl & Pearl St
4 4 83 40.68383 -73.97632 Atlantic Ave & Fort Greene Pl
5 5 116 40.74178 -74.00150 W 17 St & 8 Ave
6 6 119 40.69609 -73.97803 Park Ave & St Edwards St
So that the end result is a data frame that looks like this, obviously with no zeros.
> head(data)
bikeid end.station.id start.station.id diff.time stoptime starttime
4 15941 259 315 564 2014-09-02 17:59:30 2014-09-02 18:08:54
8 15941 229 450 2616 2014-09-09 09:28:39 2014-09-09 10:12:15
9 15941 477 465 3223 2014-09-09 15:59:23 2014-09-09 16:53:06
10 15941 319 147 570 2014-09-09 18:55:44 2014-09-09 19:05:14
14 15941 3002 304 3208 2014-09-12 14:54:10 2014-09-12 15:47:38
19 15941 469 267 2514 2014-09-25 16:13:24 2014-09-25 16:55:18
midtime end.station.lat end.station.lon end.station.name start.station.lat
4 2014-09-02 18:04:12 0 0 0 0
8 2014-09-09 09:50:27 0 0 0 0
9 2014-09-09 16:26:14 0 0 0 0
10 2014-09-09 19:00:29 0 0 0 0
14 2014-09-12 15:20:54 0 0 0 0
19 2014-09-25 16:34:21 0 0 0 0
start.station.lon start.station.name
4 0 0
8 0 0
9 0 0
10 0 0
14 0 0
19 0 0
Try this:
data <- data.frame(end.station.id = c(1,2,3), start.station.id = c(3,1,2))
stations <- data.frame(citibike_station_id = c(1,2,3), label = c("One", "Two", "Three"))
data <- merge(data, stations, by.x = "start.station.id", by.y = "citibike_station_id", all.x = TRUE)
data <- merge(data, stations, by.x = "end.station.id", by.y = "citibike_station_id", all.x = TRUE)
names(data)[3:4] <- c("start", "end")

Difficulties applying pca

I am experimenting pca with R. I have the following data:
V2 V3 V4 V5 V6 V7 V8 V9 V10 V11
2454 0 168 290 45 1715 61 551 245 30 91
222 188 94 105 60 3374 615 7 294 0 169
552 0 0 465 0 3040 0 0 771 0 0
2872 0 0 0 0 3380 0 289 0 0 0
2938 0 56 56 0 2039 538 311 113 0 254
2849 0 0 332 0 2548 0 332 0 0 221
3102 0 0 0 0 2690 0 0 0 807 807
3134 0 0 0 0 2897 289 144 144 144 0
558 0 0 0 0 3453 0 0 0 0 0
2893 0 262 175 0 2452 350 1138 262 87 175
552 0 0 351 0 3114 0 0 678 0 0
2874 0 109 54 0 2565 272 1037 109 0 0
1396 0 0 407 0 1730 0 0 305 0 0
2866 0 71 179 0 2403 358 753 35 107 143
449 0 0 0 0 2825 0 0 0 0 0
2888 0 0 523 0 2615 104 627 209 0 0
2537 0 57 0 0 1854 0 0 463 0 0
2873 0 0 342 0 3196 0 114 0 0 114
720 0 0 365 4 2704 0 4 643 4 0
218 125 31 94 219 2479 722 0 219 0 94
to which I apply the following code:
fit <- prcomp(data)
ev <- fit$rotation # pc loadings
In order to make some tests, I tried to see the data matrix I retrieve when I do keep all the components I can keep:
numberComponentsKept = 10
featureVector = ev[,1:numberComponentsKept]
newData <- as.matrix(data)%*%as.matrix(featureVector)
The newData matrix should be the same as the original one, but instead, I get a very different result:
PC1 PC2 PC3 PC4 PC5 PC6 PC7 PC8 PC9 PC10
2454 1424.447 867.5986 514.0592 -155.4783720 -574.7425 85.38724 -86.71887 90.872507 4.305168 92.08284
222 3139.681 1020.4150 376.3165 471.8718398 -796.9549 142.14301 -119.86945 32.919950 -31.269467 32.55846
552 2851.544 539.6075 883.3969 -93.3579153 -908.6689 68.34030 -40.97052 -13.856931 23.133566 89.00851
2872 3111.317 1210.0187 433.0382 -144.4065362 -381.2305 -20.08927 -49.03447 9.569258 44.201571 70.13113
2938 1788.334 945.8162 189.6526 308.7703509 -593.5577 124.88484 -109.67276 -115.127348 14.170615 99.19492
2849 2291.839 978.1819 374.7567 -243.6739292 -496.8707 287.01065 -126.22501 -18.747873 54.080763 62.80605
3102 2530.989 814.7548 -510.5978 -410.6295894 -1015.3228 46.85727 -21.20662 14.696831 23.687923 72.37691
3134 2679.430 970.1323 311.8627 124.2884480 -536.4490 -26.23858 83.86768 -17.808390 -28.802387 92.09583
558 3268.599 988.2515 353.6538 -82.9155988 -342.5729 12.96219 -60.94886 18.537087 7.291126 96.14917
2893 1921.761 1664.0084 631.0800 -55.6321469 -864.9628 -28.11045 -104.78931 37.797727 -12.078535 104.88374
552 2927.108 607.6489 799.9602 -79.5494412 -827.6994 14.14625 -50.12209 -14.020936 29.996639 86.72887
2874 2084.285 1636.7999 621.6383 -49.2934502 -577.4815 -67.27198 -11.06071 -7.167577 47.395309 51.02962
1396 1618.171 337.4320 488.2717 -100.1663625 -469.8857 212.37199 -1.19409 13.531485 -23.332701 64.58806
2866 2007.261 1387.6890 395.1586 0.8640971 -636.1243 133.41074 12.34794 -26.969634 5.506828 74.13767
449 2674.136 808.5174 289.3345 -67.8356695 -280.2689 10.60475 -49.86404 15.165731 5.965083 78.66244
2888 2254.171 1162.4988 749.7230 -206.0215007 -652.2364 302.36320 40.76341 -1.079259 17.635956 57.86999
2537 1747.098 371.8884 429.1309 9.3761544 -480.7130 -196.25019 -81.31580 2.819608 24.089379 56.91885
2873 2973.872 974.3854 433.7282 -197.0601947 -478.3647 301.96576 -81.81105 14.516646 -1.191972 100.79057
720 2537.535 504.4124 744.5909 -78.1162036 -771.1396 38.17725 -36.61446 -9.079443 25.488688 78.21597
218 2292.718 800.5257 260.6641 603.3295960 -641.9296 187.38913 11.71382 70.011487 78.047216 96.10967
What did I do wrong?
I think the problem is rather a PCA problem than an R problem. You multiply the original data with the rotation matrix and you wonder then why newData!=data. This would be only the case if the rotation matrix would be the identity matrix.
What you probably were planning to do is the following:
# Run PCA:
fit <- prcomp(USArrests)
ev <- fit$rotation # pc loadings
# Reversed PCA:
head(fit$x%*% t(as.matrix(ev)))
# Centered Original data:
head(t(apply(USArrests,1,'-',colMeans(USArrests))))
In the last step you have to center the data, because the function prcomp centers them by default.

Convert data frame from wide to long with 2 variables

I have the following wide data frame (mydf.wide):
DAY JAN F1 FEB F2 MAR F3 APR F4 MAY F5 JUN F6 JUL F7 AUG F8 SEP F9 OCT F10 NOV F11 DEC F12
1 169 0 296 0 1095 0 599 0 1361 0 1746 0 2411 0 2516 0 1614 0 908 0 488 0 209 0
2 193 0 554 0 1085 0 1820 0 1723 0 2787 0 2548 0 1402 0 1633 0 897 0 411 0 250 0
3 246 0 533 0 1111 0 1817 0 2238 0 2747 0 1575 0 1912 0 705 0 813 0 156 0 164 0
4 222 0 547 0 1125 0 1789 0 2181 0 2309 0 1569 0 1798 0 1463 0 878 0 241 0 230 0
I want to produce the following "semi-long":
DAY variable_month value_month value_F
1 JAN 169 0
I tried:
library(reshape2)
mydf.long <- melt(mydf.wide, id.vars=c("YEAR","DAY"), measure.vars=c("JAN","FEB","MAR","APR","MAY","JUN","JUL","AUG","SEP","OCT","NOV","DEC"))
but this skip the F variable and I don't know how to deal with two variables...
This is one of those cases where reshape(...) in base R is a better option.
months <- c(2,4,6,8,10,12,14,16,18,20,22,24) # column numbers of months
F <- c(3,5,7,9,11,13,15,17,19,21,23,25) # column numbers of Fn
mydf.long <- reshape(mydf.wide,idvar=1,
times=colnames(mydf.wide)[months],
varying=list(months,F),
v.names=c("value_month","value_F"),
direction="long")
colnames(mydf.long)[2] <- "variable_month"
head(mydf.long)
# DAY variable_month value_month value_F
# 1.JAN 1 JAN 169 0
# 2.JAN 2 JAN 193 0
# 3.JAN 3 JAN 246 0
# 4.JAN 4 JAN 222 0
# 1.FEB 1 FEB 296 0
# 2.FEB 2 FEB 554 0
You can also do this with 2 calls to melt(...)
library(reshape2)
months <- c(2,4,6,8,10,12,14,16,18,20,22,24) # column numbers of months
F <- c(3,5,7,9,11,13,15,17,19,21,23,25) # column numbers of Fn
z.1 <- melt(mydf.wide,id=1,measure=months,
variable.name="variable_month",value.name="value_month")
z.2 <- melt(mydf.wide,id=1,measure=F,value.name="value_F")
mydf.long <- cbind(z.1,value_F=z.2$value_F)
head(mydf.long)
# DAY variable_month value_month z.2$value_F
# 1 1 JAN 169 0
# 2 2 JAN 193 0
# 3 3 JAN 246 0
# 4 4 JAN 222 0
# 5 1 FEB 296 0
# 6 2 FEB 554 0
melt() and dcast() are available from the reshape2 and data.table packages. The recent versions of data.table allow to melt multiple columns simultaneously. The patterns() parameter can be used to specify the two sets of columns by regular expressions:
library(data.table) # CRAN version 1.10.4 used
regex_month <- toupper(paste(month.abb, collapse = "|"))
mydf.long <- melt(setDT(mydf.wide), measure.vars = patterns(regex_month, "F\\d"),
value.name = c("MONTH", "F"))
# rename factor levels
mydf.long[, variable := forcats::lvls_revalue(variable, toupper(month.abb))][]
DAY variable MONTH F
1: 1 JAN 169 0
2: 2 JAN 193 0
3: 3 JAN 246 0
4: 4 JAN 222 0
5: 1 FEB 296 0
...
44: 4 NOV 241 0
45: 1 DEC 209 0
46: 2 DEC 250 0
47: 3 DEC 164 0
48: 4 DEC 230 0
DAY variable MONTH F
Note that "F\\d" is used as regular expression in patterns(). A simple "F" would have catched FEB as well as F1, F2, etc. producing unexpected results.
Also note that mydf.wide needs to be coerced to a data.table object. Otherwise, reshape2::melt() will be dispatched on a data.frame object which doesn't recognize patterns().
Data
library(data.table)
mydf.wide <- fread(
"DAY JAN F1 FEB F2 MAR F3 APR F4 MAY F5 JUN F6 JUL F7 AUG F8 SEP F9 OCT F10 NOV F11 DEC F12
1 169 0 296 0 1095 0 599 0 1361 0 1746 0 2411 0 2516 0 1614 0 908 0 488 0 209 0
2 193 0 554 0 1085 0 1820 0 1723 0 2787 0 2548 0 1402 0 1633 0 897 0 411 0 250 0
3 246 0 533 0 1111 0 1817 0 2238 0 2747 0 1575 0 1912 0 705 0 813 0 156 0 164 0
4 222 0 547 0 1125 0 1789 0 2181 0 2309 0 1569 0 1798 0 1463 0 878 0 241 0 230 0",
data.table = FALSE)

Resources