Difference in values between rows after group_by - r

Within each group, I want to calculate the difference between each row's values and those of the previous row. However, I am getting this error:
Error in mutate():
! Problem while computing ..1 = across(where(is.numeric), diff).
ℹ The error occurred in group 1: vs = 0
Caused by error in across():
! Problem while computing column mpg.
Caused by error in dplyr_internal_error():
Run rlang::last_error() to see where the error occurred.
Here is what I have tried:
mtcars %>% group_by(vs) %>% mutate(across(where(is.numeric), diff))
This seems to do the trick:
mtcars %>% group_by(vs) %>% aggregate(. ~ vs, data=., diff) %>% as.data.frame() %>% unnest()
#//--
# A tibble: 30 × 11
vs mpg cyl disp hp drat wt qsec am gear carb
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 0 0 0 0 0 0 0.255 0.560 0 0 0
2 0 -2.3 2 200 65 -0.75 0.565 0 -1 -1 -2
3 0 -4.4 0 0 70 0.0600 0.130 -1.18 0 0 2
4 0 2.10 0 -84.2 -65 -0.140 0.500 1.56 0 0 -1
5 0 0.900 0 0 0 0 -0.340 0.200 0 0 0
6 0 -2.10 0 0 0 0 0.0500 0.400 0 0 0
7 0 -4.8 0 196. 25 -0.140 1.47 -0.0200 0 0 1
8 0 0 0 -12 10 0.0700 0.174 -0.160 0 0 0
9 0 4.3 0 -20 15 0.23 -0.0790 -0.400 0 0 0
10 0 0.800 0 -122 -80 -0.47 -1.82 -0.550 0 0 -2
# … with 20 more rows

You could explicitly define the calculation using lag. Or you could do this in base R:
library(tidyverse)
#tidyverse
# Within each `vs` group, subtract the previous row's value from every numeric
# column. The first row of a group has no predecessor, so lag() falls back to
# the group's own first value and that row's difference is 0.
mtcars |>
  group_by(vs) |>
  mutate(
    across(
      where(is.numeric),
      \(col) col - lag(col, default = first(col))
    )
  ) |>
  arrange(vs)
#> # A tibble: 32 x 11
#> # Groups: vs [2]
#> mpg cyl disp hp drat wt qsec vs am gear carb
#> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 0 0 0 0 0 0 0 0 0 0 0
#> 2 0 0 0 0 0 0.255 0.560 0 0 0 0
#> 3 -2.3 2 200 65 -0.75 0.565 0 0 -1 -1 -2
#> 4 -4.4 0 0 70 0.0600 0.130 -1.18 0 0 0 2
#> 5 2.10 0 -84.2 -65 -0.140 0.500 1.56 0 0 0 -1
#> 6 0.900 0 0 0 0 -0.340 0.200 0 0 0 0
#> 7 -2.10 0 0 0 0 0.0500 0.400 0 0 0 0
#> 8 -4.8 0 196. 25 -0.140 1.47 -0.0200 0 0 0 1
#> 9 0 0 -12 10 0.0700 0.174 -0.160 0 0 0 0
#> 10 4.3 0 -20 15 0.23 -0.0790 -0.400 0 0 0 0
#> # ... with 22 more rows
#base R
# Split mtcars by `vs`, take successive differences down every column of each
# piece, then stack the per-group results into a single data frame.
do.call(
  rbind.data.frame,
  by(mtcars, mtcars$vs, function(grp) apply(grp, 2, diff))
)
#> mpg cyl disp hp drat wt qsec vs am gear carb
#> 0.Mazda RX4 Wag 0.0 0 0.0 0 0.00 0.255 0.56 0 0 0 0
#> 0.Hornet Sportabout -2.3 2 200.0 65 -0.75 0.565 0.00 0 -1 -1 -2
#> 0.Duster 360 -4.4 0 0.0 70 0.06 0.130 -1.18 0 0 0 2
#> 0.Merc 450SE 2.1 0 -84.2 -65 -0.14 0.500 1.56 0 0 0 -1
#> 0.Merc 450SL 0.9 0 0.0 0 0.00 -0.340 0.20 0 0 0 0
#> 0.Merc 450SLC -2.1 0 0.0 0 0.00 0.050 0.40 0 0 0 0
#> 0.Cadillac Fleetwood -4.8 0 196.2 25 -0.14 1.470 -0.02 0 0 0 1
#> 0.Lincoln Continental 0.0 0 -12.0 10 0.07 0.174 -0.16 0 0 0 0
#> 0.Chrysler Imperial 4.3 0 -20.0 15 0.23 -0.079 -0.40 0 0 0 0
#> 0.Dodge Challenger 0.8 0 -122.0 -80 -0.47 -1.825 -0.55 0 0 0 -2
#> 0.AMC Javelin -0.3 0 -14.0 0 0.39 -0.085 0.43 0 0 0 0
#> 0.Camaro Z28 -1.9 0 46.0 95 0.58 0.405 -1.89 0 0 0 2
#> 0.Pontiac Firebird 5.9 0 50.0 -70 -0.65 0.005 1.64 0 0 0 -2
#> 0.Porsche 914-2 6.8 -4 -279.7 -84 1.35 -1.705 -0.35 0 1 2 0
#> 0.Ford Pantera L -10.2 4 230.7 173 -0.21 1.030 -2.20 0 0 0 2
#> 0.Ferrari Dino 3.9 -2 -206.0 -89 -0.60 -0.400 1.00 0 0 0 2
#> 0.Maserati Bora -4.7 2 156.0 160 -0.08 0.800 -0.90 0 0 0 2
#> 1.Hornet 4 Drive -1.4 2 150.0 17 -0.77 0.895 0.83 0 -1 -1 0
#> 1.Valiant -3.3 0 -33.0 -5 -0.32 0.245 0.78 0 0 0 0
#> 1.Merc 240D 6.3 -2 -78.3 -43 0.93 -0.270 -0.22 0 0 1 1
#> 1.Merc 230 -1.6 0 -5.9 33 0.23 -0.040 2.90 0 0 0 0
#> 1.Merc 280 -3.6 2 26.8 28 0.00 0.290 -4.60 0 0 0 2
#> 1.Merc 280C -1.4 0 0.0 0 0.00 0.000 0.60 0 0 0 0
#> 1.Fiat 128 14.6 -2 -88.9 -57 0.16 -1.240 0.57 0 1 0 -3
#> 1.Honda Civic -2.0 0 -3.0 -14 0.85 -0.585 -0.95 0 0 0 1
#> 1.Toyota Corolla 3.5 0 -4.6 13 -0.71 0.220 1.38 0 0 0 -1
#> 1.Toyota Corona -12.4 0 49.0 32 -0.52 0.630 0.11 0 -1 -1 0
#> 1.Fiat X1-9 5.8 0 -41.1 -31 0.38 -0.530 -1.11 0 1 1 0
#> 1.Lotus Europa 3.1 0 16.1 47 -0.31 -0.422 -2.00 0 0 1 1
#> 1.Volvo 142E -9.0 0 25.9 -4 0.34 1.267 1.70 0 0 -1 0

Related

PCA in R: Error in svd(x, nu = 0, nv = k) : infinite or missing values in 'x'

My dataframe contains about 60 observations and 15 variables. They are a mixture of continuous and binary data, but I've made sure all of the variables are numeric and do not have any NA values (I used na.omit). I used is.finite and is.na to check for infinite/NA values. Calling prcomp() on my dataframe still tells me "Infinite or missing values in x". What might I be overlooking? I am new to R and have just started learning, so I appreciate the help.
enter image description here
Some of my columns are still characters and I am not sure how to change it, using gsub and then as.numeric still gives me an error.
library(readxl)
> Pca_for_R <- read_excel("~/Pca for R.xlsx")
sapply(Pca_for_R, as.numeric)
new <- gsub(",", "", Pca_for_R)
Mypca <- prcomp(Pca_for_R, center=TRUE, scale=TRUE)
List item
Sample of my data:
CRT GFA MRT VA Contast Myp Hyp eso exo VFV VFH
[1,] 247 2.71 1283 0.63 2.50 0 0 1 0 0 0
[2,] 226 2.06 442 1.00 1.50 0 0 0 0 0 0
[3,] 251 2.16 420 1.00 1.25 0 0 0 0 0 0
[4,] 202 3.02 282 0.80 1.25 0 1 0 0 0 0
[5,] 252 2.17 640 0.50 1.50 0 0 0 0 0 0
[6,] 260 2.25 857 0.40 1.50 0 1 0 0 1 0
[7,] 255 2.51 736 0.63 1.20 0 1 1 0 0 0
[8,] 242 1.90 353 1.00 1.20 0 0 1 0 0 0
[9,] 206 1.90 292 0.80 1.20 1 0 0 0 0 0
[10,] 515 3.04 376 0.25 1.20 0 0 0 0 1 0
[11,] 222 2.13 424 0.80 10.00 0 1 0 0 1 0
[12,] 292 1.70 326 0.50 1.25 0 1 0 0 0 0
[13,] 207 2.55 427 1.00 2.50 0 1 0 0 0 0
[14,] 242 1.89 387 0.63 1.20 0 0 0 0 0 0
[15,] 205 1.86 341 1.00 2.50 0 0 0 1 0 0
[16,] 250 3.01 728 0.40 1.20 1 0 0 0 0 0
[17,] 269 3.51 410 0.50 6.00 1 0 0 0 0 1
[18,] 271 2.17 592 0.63 1.20 1 0 0 1 0 0
[19,] 264 1.52 235 0.63 1.20 0 0 0 0 0 1
[20,] 381 4.63 628 0.80 1.25 0 0 1 0 0 0
[21,] 342 3.35 422 0.30 2.50 0 0 1 0 0 0
[22,] 219 3.75 372 0.40 1.50 1 0 1 0 0 0
[23,] 306 3.35 564 0.40 3.00 0 0 0 0 0 0
[24,] 253 3.94 592 0.63 1.50 0 1 0 1 0 0
[25,] 268 2.13 387 1.00 1.25 0 0 0 0 0 0
[26,] 346 2.16 345 0.50 2.50 0 1 0 1 0 0
[27,] 289 1.79 370 0.50 1.50 0 0 0 0 0 0
[28,] 362 1.91 616 1.00 2.50 1 0 0 0 0 1
[29,] 321 3.65 791 0.50 5.00 0 0 0 0 0 0
[30,] 497 2.64 516 0.80 5.00 0 0 0 0 0 0
[31,] 291 2.52 900 1.00 5.00 0 0 0 0 0 0
[32,] 176 2.94 376 1.00 1.20 0 1 0 1 0 0
[33,] 192 2.00 336 0.32 2.00 0 1 0 0 0 0
[34,] 207 2.05 340 1.00 1.20 0 1 0 0 0 0
[35,] 331 2.05 480 0.80 1.20 0 1 0 0 0 0
[36,] 238 2.33 550 1.00 1.50 0 1 0 0 0 0
[37,] 205 4.32 554 0.63 5.00 0 1 0 0 0 0
[38,] 300 1.55 499 1.00 2.50 1 0 0 0 0 0
[39,] 374 2.92 687 1.00 5.00 0 0 0 0 0 0
[40,] 243 3.43 735 0.40 2.50 0 0 0 1 0 0
[41,] 221 2.39 489 0.50 1.25 0 0 0 0 0 0
[42,] 177 1.88 249 1.25 1.25 0 0 0 0 0 0
[43,] 377 3.35 581 0.50 5.00 0 0 0 0 0 0
[44,] 285 2.28 459 0.30 25.00 0 0 0 0 0 0
[45,] 230 2.17 438 1.00 1.80 0 1 0 0 0 0
[46,] 183 2.34 344 1.00 1.80 0 1 1 0 1 0
[47,] 245 1.63 418 0.50 1.25 0 1 1 0 0 0
[48,] 235 1.89 514 0.60 4.00 0 0 0 0 0 0
[49,] 179 2.89 525 0.30 4.00 1 0 1 0 0 0
[50,] 187 1.47 313 0.16 5.00 0 1 0 0 0 0
[51,] 243 2.48 331 0.63 3.00 1 0 0 0 1 0
[52,] 289 1.79 370 0.80 1.50 0 0 0 0 0 0
[53,] 287 2.80 569 0.60 6.00 0 1 0 1 0 0
[54,] 271 1.61 337 0.80 1.65 0 0 1 0 0 0
[55,] 198 1.70 429 0.80 1.25 0 0 0 0 0 0
[56,] 246 2.65 516 0.50 5.00 1 0 1 0 0 0
[57,] 318 2.16 746 0.25 8.00 0 0 0 1 0 0
[58,] 238 1.61 355 0.80 1.25 0 0 0 1 0 0
[59,] 268 2.13 387 0.32 1.50 0 0 0 0 0 1
[60,] 272 2.41 406 0.80 1.25 0 1 1 0 0 0

Scatterplot with multi variables

Scatterplot reference
data set
Can someone help me create three scatter plots as in the first picture? Ideally using the plot() function.
# library() stops with an error if a package is missing; require() would only
# warn and return FALSE, letting the script fail later with a less clear error.
library(tidyverse)
library(ggplot2)
# Simulated example data: 18 images, a 0/1 `female` flag (first 9 rows are 1),
# and three one-hot nationality dummies covering 6 rows each.
# No seed is set, so the random columns differ on every run.
df <- tibble(
  image = 1:18,
  m_r_exsal = rnorm(18, mean = 5, sd = 2),
  m_r_sal = rnorm(18, mean = 6, sd = 2),
  female = rep(c(1, 0), each = 9),
  lg_salary = rnorm(18, mean = 5, sd = 1.5),
  deviation = rnorm(18, mean = 1, sd = 1),
  chinese = rep(c(1, 0, 0), each = 6),
  european = rep(c(0, 1, 0), each = 6),
  american = rep(c(0, 0, 1), each = 6)
)
Example data:
# A tibble: 18 x 9
image m_r_exsal m_r_sal female lg_salary deviation chinese european american
<int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 6.19 9.56 1 5.48 1.54 1 0 0
2 2 10.1 5.17 1 3.77 -0.755 1 0 0
3 3 4.96 1.91 1 6.75 0.381 1 0 0
4 4 5.10 4.57 1 4.61 -0.207 1 0 0
5 5 -1.25 6.57 1 2.33 0.880 1 0 0
6 6 6.77 9.10 1 3.07 1.03 1 0 0
7 7 4.04 4.84 1 4.56 1.95 0 1 0
8 8 3.72 4.72 1 5.32 1.17 0 1 0
9 9 7.59 7.05 1 6.24 -0.224 0 1 0
10 10 4.09 3.94 0 5.60 2.52 0 1 0
11 11 4.15 6.05 0 7.08 -0.152 0 1 0
12 12 6.07 5.27 0 5.79 -0.323 0 1 0
13 13 4.49 4.64 0 5.97 0.457 0 0 1
14 14 6.74 4.70 0 3.38 0.377 0 0 1
15 15 7.46 9.02 0 6.65 1.85 0 0 1
16 16 4.29 5.26 0 4.07 2.18 0 0 1
17 17 2.33 1.58 0 8.43 1.06 0 0 1
18 18 4.78 8.75 0 5.03 0.101 0 0 1
Making the plot:
# Turn the one-hot nationality dummies into a single `country` label and the
# 0/1 `female` flag into readable text, then draw one panel per country with
# points and a per-sex linear fit.
df %>%
  mutate(
    # case_when() without a fallback yields NA where the dummy is not 1.
    chinese = case_when(chinese == 1 ~ "chinese"),
    european = case_when(european == 1 ~ "european"),
    american = case_when(american == 1 ~ "american"),
    female = case_when(
      female == 1 ~ "female",
      TRUE ~ "male"
    )
  ) %>%
  # na.rm = TRUE drops the NA entries before pasting, so no literal "NA" text
  # ends up in `country` and nothing has to be stripped out afterwards (which
  # would also damage any label that legitimately contains "NA").
  unite(country, chinese:american, remove = TRUE, sep = "", na.rm = TRUE) %>%
  ggplot() +
  aes(lg_salary, deviation, col = female) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  facet_wrap(~ country)
The output:

Use group by logic with lapply function

I can use the function "tabyl" from the janitor package like this to apply tabyl to every column.
lapply(mtcars[,2:4],tabyl)
What I really want to do is group by cyl and then apply tabyl to all of those specified columns, something like this (does not work):
lapply(mtcars[,2:4],tabyl(cyl))
How would I put this above line into an lapply function? Or is there some other way of grouping and using a group by logic?
Please note, I have hundreds of variables in my actual data, and I want to apply tabyl to almost all the variables in my data (all the numeric at least). So I need a way of calling tabyl on them without explicitly calling on the variable names!
I want it to look like this(provided in an answer below), except I want to include MANY more variables. Imagine mtcars has 104 variables, and I want to apply this group tabyl on only the numeric ones.
cyl
4 6 8
n Percent n Percent n Percent
disp 71.1 1 9.091 0 0.00 0 0.000
75.7 1 9.091 0 0.00 0 0.000
78.7 1 9.091 0 0.00 0 0.000
79 1 9.091 0 0.00 0 0.000
95.1 1 9.091 0 0.00 0 0.000
108 1 9.091 0 0.00 0 0.000
120.1 1 9.091 0 0.00 0 0.000
120.3 1 9.091 0 0.00 0 0.000
121 1 9.091 0 0.00 0 0.000
140.8 1 9.091 0 0.00 0 0.000
145 0 0.000 1 14.29 0 0.000
146.7 1 9.091 0 0.00 0 0.000
160 0 0.000 2 28.57 0 0.000
167.6 0 0.000 2 28.57 0 0.000
225 0 0.000 1 14.29 0 0.000
258 0 0.000 1 14.29 0 0.000
275.8 0 0.000 0 0.00 3 21.429
301 0 0.000 0 0.00 1 7.143
304 0 0.000 0 0.00 1 7.143
318 0 0.000 0 0.00 1 7.143
350 0 0.000 0 0.00 1 7.143
351 0 0.000 0 0.00 1 7.143
360 0 0.000 0 0.00 2 14.286
400 0 0.000 0 0.00 1 7.143
440 0 0.000 0 0.00 1 7.143
460 0 0.000 0 0.00 1 7.143
472 0 0.000 0 0.00 1 7.143
All 11 100.000 7 100.00 14 100.000
hp 52 1 9.091 0 0.00 0 0.000
62 1 9.091 0 0.00 0 0.000
65 1 9.091 0 0.00 0 0.000
66 2 18.182 0 0.00 0 0.000
91 1 9.091 0 0.00 0 0.000
93 1 9.091 0 0.00 0 0.000
95 1 9.091 0 0.00 0 0.000
97 1 9.091 0 0.00 0 0.000
105 0 0.000 1 14.29 0 0.000
109 1 9.091 0 0.00 0 0.000
110 0 0.000 3 42.86 0 0.000
113 1 9.091 0 0.00 0 0.000
123 0 0.000 2 28.57 0 0.000
150 0 0.000 0 0.00 2 14.286
175 0 0.000 1 14.29 2 14.286
180 0 0.000 0 0.00 3 21.429
205 0 0.000 0 0.00 1 7.143
215 0 0.000 0 0.00 1 7.143
230 0 0.000 0 0.00 1 7.143
245 0 0.000 0 0.00 2 14.286
264 0 0.000 0 0.00 1 7.143
335 0 0.000 0 0.00 1 7.143
All 11 100.000 7 100.00 14 100.000
There are lots of ways to generate counts and frequencies by multiple variables. A solution with tables::tabular() enables one to display the "by group" on the column dimension, and other variables on the row dimension of a table.
We'll use the mtcars data to display disp and hp on the row dimension, and cyl on the column dimension.
library(tables)
# Rows: every level of disp and of hp, each followed by an "All" total (+ 1).
# Columns: for each level of cyl, a count (n) and a column percentage.
tabular(
  ((Factor(disp) + 1) + (Factor(hp) + 1)) ~
    (Factor(cyl)) * ((n = 1) + Percent("col")),
  data = mtcars
)
...and the output:
cyl
4 6 8
n Percent n Percent n Percent
disp 71.1 1 9.091 0 0.00 0 0.000
75.7 1 9.091 0 0.00 0 0.000
78.7 1 9.091 0 0.00 0 0.000
79 1 9.091 0 0.00 0 0.000
95.1 1 9.091 0 0.00 0 0.000
108 1 9.091 0 0.00 0 0.000
120.1 1 9.091 0 0.00 0 0.000
120.3 1 9.091 0 0.00 0 0.000
121 1 9.091 0 0.00 0 0.000
140.8 1 9.091 0 0.00 0 0.000
145 0 0.000 1 14.29 0 0.000
146.7 1 9.091 0 0.00 0 0.000
160 0 0.000 2 28.57 0 0.000
167.6 0 0.000 2 28.57 0 0.000
225 0 0.000 1 14.29 0 0.000
258 0 0.000 1 14.29 0 0.000
275.8 0 0.000 0 0.00 3 21.429
301 0 0.000 0 0.00 1 7.143
304 0 0.000 0 0.00 1 7.143
318 0 0.000 0 0.00 1 7.143
350 0 0.000 0 0.00 1 7.143
351 0 0.000 0 0.00 1 7.143
360 0 0.000 0 0.00 2 14.286
400 0 0.000 0 0.00 1 7.143
440 0 0.000 0 0.00 1 7.143
460 0 0.000 0 0.00 1 7.143
472 0 0.000 0 0.00 1 7.143
All 11 100.000 7 100.00 14 100.000
hp 52 1 9.091 0 0.00 0 0.000
62 1 9.091 0 0.00 0 0.000
65 1 9.091 0 0.00 0 0.000
66 2 18.182 0 0.00 0 0.000
91 1 9.091 0 0.00 0 0.000
93 1 9.091 0 0.00 0 0.000
95 1 9.091 0 0.00 0 0.000
97 1 9.091 0 0.00 0 0.000
105 0 0.000 1 14.29 0 0.000
109 1 9.091 0 0.00 0 0.000
110 0 0.000 3 42.86 0 0.000
113 1 9.091 0 0.00 0 0.000
123 0 0.000 2 28.57 0 0.000
150 0 0.000 0 0.00 2 14.286
175 0 0.000 1 14.29 2 14.286
180 0 0.000 0 0.00 3 21.429
205 0 0.000 0 0.00 1 7.143
215 0 0.000 0 0.00 1 7.143
230 0 0.000 0 0.00 1 7.143
245 0 0.000 0 0.00 2 14.286
264 0 0.000 0 0.00 1 7.143
335 0 0.000 0 0.00 1 7.143
All 11 100.000 7 100.00 14 100.000
>
UPDATE: automate the process
In the comments to my answer, the original poster asked how one might automate tabular() to avoid having to type out all the variables to be tabulated. We can do this with lapply() and an anonymous function.
Since the OP used column numbers as part of their question, we'll create a vector of columns from the mtcars data frame to be tabulated. We'll use that as the input to lapply(), along with two other arguments, one for the data frame, and another to specify the column variable in the table. Since the column variable will be a single variable, we specified it with its column name rather than a number.
# generalize and automate
# Positions of the columns to tabulate.
varList <- 2:4
# One table per entry of varList. lapply() forwards `df` and `byVar` to the
# anonymous function unchanged on every call.
lapply(
  varList,
  function(x, df, byVar) {
    # The second argument of each Factor() call supplies the heading text, so
    # the tables show the column name instead of df[[x]] / df[[byVar]].
    tabular(
      (Factor(df[[x]], paste(colnames(df)[x])) + 1) ~
        ((Factor(df[[byVar]], paste(byVar))) * ((n = 1) + Percent("col"))),
      data = df
    )
  },
  df = mtcars,
  byVar = "cyl"
)
The tricky part is automating the process without the output tables having row headers of df[[x]] and column headers of df[[byVar]]. To avoid this situation, we extract the column name for the row dimension with colnames(), and we overwrite the header for the columns by pasting the byVar argument into the header.
...and the output:
[[1]]
cyl
4 6 8
cyl n Percent n Percent n Percent
4 11 100 0 0 0 0
6 0 0 7 100 0 0
8 0 0 0 0 14 100
All 11 100 7 100 14 100
[[2]]
cyl
4 6 8
disp n Percent n Percent n Percent
71.1 1 9.091 0 0.00 0 0.000
75.7 1 9.091 0 0.00 0 0.000
78.7 1 9.091 0 0.00 0 0.000
79 1 9.091 0 0.00 0 0.000
95.1 1 9.091 0 0.00 0 0.000
108 1 9.091 0 0.00 0 0.000
120.1 1 9.091 0 0.00 0 0.000
120.3 1 9.091 0 0.00 0 0.000
121 1 9.091 0 0.00 0 0.000
140.8 1 9.091 0 0.00 0 0.000
145 0 0.000 1 14.29 0 0.000
146.7 1 9.091 0 0.00 0 0.000
160 0 0.000 2 28.57 0 0.000
167.6 0 0.000 2 28.57 0 0.000
225 0 0.000 1 14.29 0 0.000
258 0 0.000 1 14.29 0 0.000
275.8 0 0.000 0 0.00 3 21.429
301 0 0.000 0 0.00 1 7.143
304 0 0.000 0 0.00 1 7.143
318 0 0.000 0 0.00 1 7.143
350 0 0.000 0 0.00 1 7.143
351 0 0.000 0 0.00 1 7.143
360 0 0.000 0 0.00 2 14.286
400 0 0.000 0 0.00 1 7.143
440 0 0.000 0 0.00 1 7.143
460 0 0.000 0 0.00 1 7.143
472 0 0.000 0 0.00 1 7.143
All 11 100.000 7 100.00 14 100.000
[[3]]
cyl
4 6 8
hp n Percent n Percent n Percent
52 1 9.091 0 0.00 0 0.000
62 1 9.091 0 0.00 0 0.000
65 1 9.091 0 0.00 0 0.000
66 2 18.182 0 0.00 0 0.000
91 1 9.091 0 0.00 0 0.000
93 1 9.091 0 0.00 0 0.000
95 1 9.091 0 0.00 0 0.000
97 1 9.091 0 0.00 0 0.000
105 0 0.000 1 14.29 0 0.000
109 1 9.091 0 0.00 0 0.000
110 0 0.000 3 42.86 0 0.000
113 1 9.091 0 0.00 0 0.000
123 0 0.000 2 28.57 0 0.000
150 0 0.000 0 0.00 2 14.286
175 0 0.000 1 14.29 2 14.286
180 0 0.000 0 0.00 3 21.429
205 0 0.000 0 0.00 1 7.143
215 0 0.000 0 0.00 1 7.143
230 0 0.000 0 0.00 1 7.143
245 0 0.000 0 0.00 2 14.286
264 0 0.000 0 0.00 1 7.143
335 0 0.000 0 0.00 1 7.143
All 11 100.000 7 100.00 14 100.000
One way is this, although I don't know if you need the cyl column:
# For each cyl group, run tabyl() on every one of the selected columns.
by(mtcars[, 2:4], mtcars$cyl, function(grp) lapply(grp, tabyl))
Or a tidy way, (I think the list part can be improved) :
# Copy cyl into `id` and group on the copy, so that cyl itself is still
# tabulated along with disp and hp. Each cell of the result is a one-element
# list holding the tabyl() data frame for that group and column.
out <- mtcars[, 2:4] %>%
  mutate(id = cyl) %>%
  group_by(id) %>%
  summarize_all(~ list(tabyl(.)))
out
# A tibble: 3 x 4
id cyl disp hp
<dbl> <list> <list> <list>
1 4 <df[,3] [1 × 3]> <df[,3] [11 × 3]> <df[,3] [10 × 3]>
2 6 <df[,3] [1 × 3]> <df[,3] [5 × 3]> <df[,3] [4 × 3]>
3 8 <df[,3] [1 × 3]> <df[,3] [11 × 3]> <df[,3] [9 × 3]>
out %>% filter(id==4) %>% pull(hp)
[[1]]
. n percent
52 1 0.09090909
62 1 0.09090909
65 1 0.09090909
66 2 0.18181818
91 1 0.09090909
93 1 0.09090909
95 1 0.09090909
97 1 0.09090909
109 1 0.09090909
113 1 0.09090909

How to transform numbers of a variable that are read as a factor so that they are read as numbers? [duplicate]

This question already has answers here:
How to convert a factor to integer\numeric without loss of information?
(12 answers)
Closed 4 years ago.
First of all, I'm new at R, I'm just learning.
I have a data frame and I want to make some plots with two variables. One of these variables is read as a factor, although its values are real numbers. The variable is a percentage, and I want to plot this percentage against some municipalities. How can I transform these factor values into numeric values?
I've tried the following code, because the guide I'm reading says to convert factors to numeric with the function as.numeric(), but the result is totally different numbers.
for example
#the data frame is valle.abu2
valle.abu2$Porcentaje.de.Excedencias
#then
as.numeric(valle.abu2$Porcentaje.de.Excedencias)
valle.abu2$Porcentaje.de.Excedencias
[1] 1.3 0.04 1.6 0 0 0 0.31 0.61 0 2.31 3.6 8.04 0 7.18 0 5.88 1.35 0
[19] 2.56 0 3.2 0 0 0 0 0 0.05 0.32 0 5.23 0 0 0 0 0 0
[37] 0 5.42 5.54 11.44 0 2.51 0 4.88 0 3.45 0 2.78 2.7 0 4.39 0 0 0
[55] 0 3.99 3.42 6.01 0 5.52 0 0.04 0 0.46 0.34 0 4.63 0 14.65 2.91 5.9 4.17
[73] 0 0 0 0 0 0 1.15 1.52 9.17 2.22 3.82 0 0 0 0 7.04 3.57 12.5
[91] 0 0 0 0.72 1.32 0 9.88 2.63 0 0 0 0 0 0 37.57
134 Levels: 0 0.03 0.04 0.05 0.06 0.07 0.09 0.1 0.11 0.14 0.15 0.23 0.27 0.29 0.31 0.32 0.33 0.34 0.42
as.numeric(valle.abu2$Porcentaje.de.Excedencias)
[1] 42 3 48 1 1 1 15 25 1 69 92 129 1 127 1 120 44 1 71 1 86 1 1 1 1 1 4
[28] 16 1 115 1 1 1 1 1 1 1 116 118 59 1 70 1 108 1 90 1 75 73 1 103 1 1 1
[55] 1 97 89 122 1 117 1 3 1 21 18 1 104 1 64 77 121 101 1 1 1 1 1 1 39 47 131
[82] 67 96 1 1 1 1 126 91 60 1 1 1 28 43 1 134 72 1 1 1 1 1 1 98
Try:
#' Convert a factor whose levels are numbers back to those numbers
#'
#' Calling `as.numeric()` directly on a factor returns the internal integer
#' codes, not the values shown as labels. This converts the (few) levels once
#' and then indexes that numeric vector by the factor.
#'
#' @param x A factor whose levels are text representations of numbers.
#' @return A numeric vector the same length as `x`. Levels that cannot be
#'   parsed as numbers become `NA` (with the usual coercion warning), and
#'   missing entries of `x` stay `NA`.
as_numeric_factor <- function(x) {
  # Without this check a non-factor input has NULL levels, and the indexing
  # below would quietly return a vector of NA instead of signalling a problem.
  if (!is.factor(x)) {
    stop("`x` must be a factor", call. = FALSE)
  }
  as.numeric(levels(x))[x]
}
as_numeric_factor(valle.abu2$Porcentaje.de.Excedencias)
Explanation.
The help page ?factor section Warning includes two different ways of doing what the question asks for, and states that one of them is more efficient.
To transform a factor f to approximately its original numeric values,
as.numeric(levels(f))[f] is recommended and slightly more efficient than
as.numeric(as.character(f)).
Here is a simple speed test. set.seed is not needed since the results of interest are the timings, not the computed values.
library(microbenchmark)
library(ggplot2)
# Alternative conversion used as the comparison case in the benchmark:
# turn every element into its label text first, then parse that text.
as_numeric_factor2 <- function(x) {
  labels <- as.character(x)
  as.numeric(labels)
}
f <- factor(rnorm(1e4))
mb <- microbenchmark(
levl = as_numeric_factor(f),
char = as_numeric_factor2(f)
)
autoplot(mb)

Reading in columns without clear separators with read.table

Hello, I'm loading a data file that is formatted as a table with columns separated by multiple spaces. Ordinarily it loads easily via read.table(data_file, sep = "", header = T, fill = T), but some values are not separated by spaces when they are negative:
523.2 -166.1 1.62 0.079 0.0 0.0 0.0 2260 0
528.4 -168.6 -0.71-0.034 0.0 0.0 0.0 2284 0
533.9 -169.7 -1.75-0.085 0.0 0.0 0.0 2308 0
538.4 -169.5 -1.60-0.078 0.0 0.0 0.0 2333 0
543.3 -170.8 -2.83-0.137 0.0 0.0 0.0 2357 0
548.2 -171.8 -3.77-0.183 0.0 0.0 0.0 2381 0
552.8 -172.1 -3.87-0.187 0.0 0.0 0.0 2406 0
554.9 -172.5 -4.23-0.205 0.0 0.0 0.0 2430 0
As a result, a whole run such as -3.77-0.183 is read as a single value.
What is a convenient way to cope with this without first converting the file with other scripts?
Thanks in advance!
One way would be:
# Read the file as raw text so fused columns can be repaired before parsing.
lines <- readLines("datN.txt")
# Wherever a digit is directly followed by "-<digits>" (or by whitespace and
# digits), insert a space in front of that piece. This separates values such
# as "-0.71-0.034"; fields that were already separated just get extra padding.
lines1 <- gsub(
  pattern = "(?<=[0-9])((-|\\s)[0-9]+)",
  replacement = " \\1",
  x = lines,
  perl = TRUE
)
# Parse the repaired text as a whitespace-delimited table without a header.
dat <- read.table(text = lines1, sep = "", header = FALSE)
dat
# V1 V2 V3 V4 V5 V6 V7 V8 V9
#1 523.2 -166.1 1.62 0.079 0 0 0 2260 0
#2 528.4 -168.6 -0.71 -0.034 0 0 0 2284 0
#3 533.9 -169.7 -1.75 -0.085 0 0 0 2308 0
#4 538.4 -169.5 -1.60 -0.078 0 0 0 2333 0
#5 543.3 -170.8 -2.83 -0.137 0 0 0 2357 0
#6 548.2 -171.8 -3.77 -0.183 0 0 0 2381 0
#7 552.8 -172.1 -3.87 -0.187 0 0 0 2406 0
#8 554.9 -172.5 -4.23 -0.205 0 0 0 2430 0
str(dat)
#'data.frame': 8 obs. of 9 variables:
#$ V1: num 523 528 534 538 543 ...
#$ V2: num -166 -169 -170 -170 -171 ...
#$ V3: num 1.62 -0.71 -1.75 -1.6 -2.83 -3.77 -3.87 -4.23
#$ V4: num 0.079 -0.034 -0.085 -0.078 -0.137 -0.183 -0.187 -0.205
#$ V5: num 0 0 0 0 0 0 0 0
#$ V6: num 0 0 0 0 0 0 0 0
#$ V7: num 0 0 0 0 0 0 0 0
#$ V8: int 2260 2284 2308 2333 2357 2381 2406 2430
#$ V9: int 0 0 0 0 0 0 0 0
If the file is well-formatted (from a fixed-width-field perspective), then:
data <- read.fwf("fixed.dat", widths = c(6, 9, 10, 6, 12, 9, 9, 7, 9))
data
## V1 V2 V3 V4 V5 V6 V7 V8 V9
## 1 523.2 -166.1 1.62 0.079 0 0 0 2260 0
## 2 528.4 -168.6 -0.71 -0.034 0 0 0 2284 0
## 3 533.9 -169.7 -1.75 -0.085 0 0 0 2308 0
## 4 538.4 -169.5 -1.60 -0.078 0 0 0 2333 0
## 5 543.3 -170.8 -2.83 -0.137 0 0 0 2357 0
## 6 548.2 -171.8 -3.77 -0.183 0 0 0 2381 0
## 7 552.8 -172.1 -3.87 -0.187 0 0 0 2406 0
## 8 554.9 -172.5 -4.23 -0.205 0 0 0 2430 0
might work.

Resources