hth1 is a data frame that I already have.
> hth1
Source: local data frame [13 x 14]
Groups: team [13]
team CSK DC DD GL KKR KTK KXIP MI PW RCB RPSG
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 CSK 0 8 11 0 11 2 9 10 4 10 0
2 DC 2 0 8 0 2 1 7 5 3 8 0
3 DD 5 3 0 0 7 2 8 5 2 10 2
4 GL 0 0 2 0 0 0 0 0 0 1 0
5 KKR 5 7 10 2 0 0 5 10 3 15 0
6 KTK 0 0 0 0 2 0 1 0 1 2 0
7 KXIP 8 3 10 2 14 0 0 11 2 6 1
8 MI 12 5 13 2 8 1 7 0 3 11 1
9 PW 2 1 4 0 2 0 4 3 0 1 0
10 RCB 9 3 7 2 3 0 12 8 4 0 1
11 RPSG 0 0 0 2 2 0 1 1 0 1 0
12 RR 8 2 7 0 14 1 7 6 2 7 0
13 SH 3 0 4 0 5 0 4 5 2 5 2
# ... with 2 more variables: RR <dbl>, SH <dbl>
Why do the data frame returned by bind_rows() and the original data frame differ?
> h <- list(hth1)
> hth_b1 <- bind_rows(h)
> identical(hth1, hth_b1)
[1] FALSE
> class(hth_b1)
[1] "grouped_df" "tbl_df" "tbl" "data.frame"
> class(hth1)
[1] "grouped_df" "tbl_df" "tbl" "data.frame"
> setequal(hth1, hth_b1)
TRUE
> anti_join(hth1, hth_b1)
Joining, by = c("team", "CSK", "DC", "DD", "GL", "KKR", "KTK", "KXIP", "MI", "PW", "RCB", "RPSG", "RR", "SH")
Source: local data frame [0 x 14]
Groups: team [13]
# ... with 14 variables: team <chr>, CSK <dbl>, DC <dbl>, DD <dbl>, GL <dbl>,
# KKR <dbl>, KTK <dbl>, KXIP <dbl>, MI <dbl>, PW <dbl>, RCB <dbl>,
# RPSG <dbl>, RR <dbl>, SH <dbl>
What am I missing? I have been stuck here for a long time.
Update 1:
As requested by Benjamin, I ran the dput() function on both data frames. Here is the output.
> dput(hth_b1)
structure(list(team = c("CSK", "DC", "DD", "GL", "KKR", "KTK",
"KXIP", "MI", "PW", "RCB", "RPSG", "RR", "SH"), CSK = c(0, 2,
5, 0, 5, 0, 8, 12, 2, 9, 0, 8, 3), DC = c(8, 0, 3, 0, 7, 0, 3,
5, 1, 3, 0, 2, 0), DD = c(11, 8, 0, 2, 10, 0, 10, 13, 4, 7, 0,
7, 4), GL = c(0, 0, 0, 0, 2, 0, 2, 2, 0, 2, 2, 0, 0), KKR = c(11,
2, 7, 0, 0, 2, 14, 8, 2, 3, 2, 14, 5), KTK = c(2, 1, 2, 0, 0,
0, 0, 1, 0, 0, 0, 1, 0), KXIP = c(9, 7, 8, 0, 5, 1, 0, 7, 4,
12, 1, 7, 4), MI = c(10, 5, 5, 0, 10, 0, 11, 0, 3, 8, 1, 6, 5
), PW = c(4, 3, 2, 0, 3, 1, 2, 3, 0, 4, 0, 2, 2), RCB = c(10,
8, 10, 1, 15, 2, 6, 11, 1, 0, 1, 7, 5), RPSG = c(0, 0, 2, 0,
0, 0, 1, 1, 0, 1, 0, 0, 2), RR = c(9, 7, 9, 0, 1, 1, 8, 10, 3,
9, 0, 0, 7), SH = c(3, 0, 4, 3, 4, 0, 4, 3, 0, 4, 0, 0, 0)), .Names = c("team",
"CSK", "DC", "DD", "GL", "KKR", "KTK", "KXIP", "MI", "PW", "RCB",
"RPSG", "RR", "SH"), row.names = c(NA, -13L), class = c("grouped_df",
"tbl_df", "tbl", "data.frame"), vars = list(team), indices = list(
0L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L), group_sizes = c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), biggest_group_size = 1L, labels = structure(list(
team = c("CSK", "DC", "DD", "GL", "KKR", "KTK", "KXIP", "MI",
"PW", "RCB", "RPSG", "RR", "SH")), row.names = c(NA, -13L
), class = "data.frame", vars = list(team), .Names = "team"))
>
> dput(hth1)
structure(list(team = c("CSK", "DC", "DD", "GL", "KKR", "KTK",
"KXIP", "MI", "PW", "RCB", "RPSG", "RR", "SH"), CSK = c(0, 2,
5, 0, 5, 0, 8, 12, 2, 9, 0, 8, 3), DC = c(8, 0, 3, 0, 7, 0, 3,
5, 1, 3, 0, 2, 0), DD = c(11, 8, 0, 2, 10, 0, 10, 13, 4, 7, 0,
7, 4), GL = c(0, 0, 0, 0, 2, 0, 2, 2, 0, 2, 2, 0, 0), KKR = c(11,
2, 7, 0, 0, 2, 14, 8, 2, 3, 2, 14, 5), KTK = c(2, 1, 2, 0, 0,
0, 0, 1, 0, 0, 0, 1, 0), KXIP = c(9, 7, 8, 0, 5, 1, 0, 7, 4,
12, 1, 7, 4), MI = c(10, 5, 5, 0, 10, 0, 11, 0, 3, 8, 1, 6, 5
), PW = c(4, 3, 2, 0, 3, 1, 2, 3, 0, 4, 0, 2, 2), RCB = c(10,
8, 10, 1, 15, 2, 6, 11, 1, 0, 1, 7, 5), RPSG = c(0, 0, 2, 0,
0, 0, 1, 1, 0, 1, 0, 0, 2), RR = c(9, 7, 9, 0, 1, 1, 8, 10, 3,
9, 0, 0, 7), SH = c(3, 0, 4, 3, 4, 0, 4, 3, 0, 4, 0, 0, 0)), .Names = c("team",
"CSK", "DC", "DD", "GL", "KKR", "KTK", "KXIP", "MI", "PW", "RCB",
"RPSG", "RR", "SH"), class = c("grouped_df", "tbl_df", "tbl",
"data.frame"), row.names = c(NA, -13L), vars = list(team), labels = structure(list(
team = c("CSK", "DC", "DD", "GL", "KKR", "KTK", "KXIP", "MI",
"PW", "RCB", "RPSG", "RR", "SH")), class = "data.frame", row.names = c(NA,
-13L), vars = list(team), drop = TRUE, .Names = "team"), indices = list(
0L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L), drop = TRUE, group_sizes = c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), biggest_group_size = 1L)
There is a difference between the two outputs: hth1 has an extra drop = TRUE attribute.
I don't understand why it is not there in the other one.
A reproducible example:
library(tidyverse)
test1 <- mtcars %>% group_by(cyl)
test2 <- bind_rows(list(test1))
identical(test1, test2) #FALSE
all_equal(test1, test2) #TRUE
You can check both their attributes and you can see the rownames differ:
rownames(test1)
[1] "Mazda RX4" "Mazda RX4 Wag" "Datsun 710"
[4] "Hornet 4 Drive" "Hornet Sportabout" "Valiant"
[7] "Duster 360" "Merc 240D" "Merc 230"
[10] "Merc 280" "Merc 280C" "Merc 450SE"
[13] "Merc 450SL" "Merc 450SLC" "Cadillac Fleetwood"
[16] "Lincoln Continental" "Chrysler Imperial" "Fiat 128"
[19] "Honda Civic" "Toyota Corolla" "Toyota Corona"
[22] "Dodge Challenger" "AMC Javelin" "Camaro Z28"
[25] "Pontiac Firebird" "Fiat X1-9" "Porsche 914-2"
[28] "Lotus Europa" "Ford Pantera L" "Ferrari Dino"
[31] "Maserati Bora" "Volvo 142E"
rownames(test2)
[1] "1" "2" "3" "4" "5" "6" "7" "8" "9" "10" "11" "12" "13"
[14] "14" "15" "16" "17" "18" "19" "20" "21" "22" "23" "24" "25" "26"
[27] "27" "28" "29" "30" "31" "32"
Never expect tibbles to treat your row names with respect; they may be silently dropped at any time.
Forgive the formatting on this answer, but it would appear that you have labels attached to one object, and not in the other. Where the labels got attached or removed isn't something I can know without looking at code that generates the objects. I've bolded the difference in your objects below.
Note: not formatting this as code is a deliberate choice. Formatting as code prevents me from marking the difference in the structure in bold text
dput(hth_b1)
structure(list(team = c("CSK", "DC", "DD", "GL", "KKR", "KTK",
"KXIP", "MI", "PW", "RCB", "RPSG", "RR", "SH"), CSK = c(0, 2,
5, 0, 5, 0, 8, 12, 2, 9, 0, 8, 3), DC = c(8, 0, 3, 0, 7, 0, 3,
5, 1, 3, 0, 2, 0), DD = c(11, 8, 0, 2, 10, 0, 10, 13, 4, 7, 0,
7, 4), GL = c(0, 0, 0, 0, 2, 0, 2, 2, 0, 2, 2, 0, 0), KKR = c(11,
2, 7, 0, 0, 2, 14, 8, 2, 3, 2, 14, 5), KTK = c(2, 1, 2, 0, 0,
0, 0, 1, 0, 0, 0, 1, 0), KXIP = c(9, 7, 8, 0, 5, 1, 0, 7, 4,
12, 1, 7, 4), MI = c(10, 5, 5, 0, 10, 0, 11, 0, 3, 8, 1, 6, 5
), PW = c(4, 3, 2, 0, 3, 1, 2, 3, 0, 4, 0, 2, 2), RCB = c(10,
8, 10, 1, 15, 2, 6, 11, 1, 0, 1, 7, 5), RPSG = c(0, 0, 2, 0,
0, 0, 1, 1, 0, 1, 0, 0, 2), RR = c(9, 7, 9, 0, 1, 1, 8, 10, 3,
9, 0, 0, 7), SH = c(3, 0, 4, 3, 4, 0, 4, 3, 0, 4, 0, 0, 0)), .Names = c("team",
"CSK", "DC", "DD", "GL", "KKR", "KTK", "KXIP", "MI", "PW", "RCB",
"RPSG", "RR", "SH"), row.names = c(NA, -13L), class = c("grouped_df",
"tbl_df", "tbl", "data.frame"), vars = list(team), indices = list(
0L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L), group_sizes = c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), biggest_group_size = 1L , labels = structure(list(
team = c("CSK", "DC", "DD", "GL", "KKR", "KTK", "KXIP", "MI",
"PW", "RCB", "RPSG", "RR", "SH")), row.names = c(NA, -13L
), class = "data.frame", vars = list(team), .Names = "team"))
dput(hth1)
structure(list(team = c("CSK", "DC", "DD", "GL", "KKR", "KTK",
"KXIP", "MI", "PW", "RCB", "RPSG", "RR", "SH"), CSK = c(0, 2,
5, 0, 5, 0, 8, 12, 2, 9, 0, 8, 3), DC = c(8, 0, 3, 0, 7, 0, 3,
5, 1, 3, 0, 2, 0), DD = c(11, 8, 0, 2, 10, 0, 10, 13, 4, 7, 0,
7, 4), GL = c(0, 0, 0, 0, 2, 0, 2, 2, 0, 2, 2, 0, 0), KKR = c(11,
2, 7, 0, 0, 2, 14, 8, 2, 3, 2, 14, 5), KTK = c(2, 1, 2, 0, 0,
0, 0, 1, 0, 0, 0, 1, 0), KXIP = c(9, 7, 8, 0, 5, 1, 0, 7, 4,
12, 1, 7, 4), MI = c(10, 5, 5, 0, 10, 0, 11, 0, 3, 8, 1, 6, 5
), PW = c(4, 3, 2, 0, 3, 1, 2, 3, 0, 4, 0, 2, 2), RCB = c(10,
8, 10, 1, 15, 2, 6, 11, 1, 0, 1, 7, 5), RPSG = c(0, 0, 2, 0,
0, 0, 1, 1, 0, 1, 0, 0, 2), RR = c(9, 7, 9, 0, 1, 1, 8, 10, 3,
9, 0, 0, 7), SH = c(3, 0, 4, 3, 4, 0, 4, 3, 0, 4, 0, 0, 0)), .Names = c("team",
"CSK", "DC", "DD", "GL", "KKR", "KTK", "KXIP", "MI", "PW", "RCB",
"RPSG", "RR", "SH"), class = c("grouped_df", "tbl_df", "tbl",
"data.frame"), row.names = c(NA, -13L), vars = list(team), labels = structure(list(
team = c("CSK", "DC", "DD", "GL", "KKR", "KTK", "KXIP", "MI",
"PW", "RCB", "RPSG", "RR", "SH")), class = "data.frame", row.names = c(NA,
-13L), vars = list(team), drop = TRUE, .Names = "team"), indices = list(
0L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L), drop = TRUE, group_sizes = c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), biggest_group_size = 1L)
In the example below, I will add labels to the mtcars data frame, then run it through bind_rows, and you'll see that the labels are no longer present. This is what I believe is happening to your data.
library(Hmisc)
mtcars2 <- mtcars
label(mtcars2, self = FALSE) <- toupper(names(mtcars))
library(dplyr)
mtcars3 <- bind_rows(mtcars2)
identical(mtcars2, mtcars3)
label(mtcars3)
Related
I am calculating the correlation between each variable and the target feature in a data frame. It works well except for one variable, Age, which does not produce a correlation; instead I get NA. I removed all NA values before even starting the analysis, so the data should be clean.
This is the code: (PD is the target variable and I want to compare it with all other variables. PD is binary)
pearsons = c()
for (i in 1:length(colnames(Train_set))){
pearsons[i] = cor(Train_set[,i], Train_set$PD, method = 'pearson')
}
This is the data structure (only some of it):
> glimpse(Train_set)
Rows: 1,219
Columns: 56
$ PD <dbl> 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1,…
$ gender <int> 2, 2, 2, 1, 2, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 1, 2, 1, 2, 2, 2, 2,…
$ cancer_type <int> 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,…
$ Treatment <int> 5, 6, 6, 6, 5, 6, 5, 6, 5, 5, 5, 5, 6, 6, 6, 6, 6, 5, 5, 6, 6, 6, 6, 5, 6, 5, 5, 6, 5,…
$ totaldata_new.Age <int> 50, 66, 51, 60, 31, 70, 51, 56, 65, 62, 55, 69, 32, 82, 60, 49, 56, 59, 50, 51, 70, 74…
$ Adipocytes <dbl> 0.000000000, 0.000000000, 0.005592077, 0.005844092, 0.038175712, 0.000000000, 0.005063…
$ B.cells <dbl> 0.045214394, 1.300478781, 0.184967801, 0.032890485, 0.041641426, 0.006477740, 0.653999…
$ Basophils <dbl> 0.120695085, 0.065615816, 0.362173522, 0.039214941, 0.225555640, 0.056926623, 0.019076…
totaldata_new.Age is the Age variable. I tried converting it with as.numeric() and as.integer(), but neither worked.
This is the training set,
structure(list(PD = c(0, 0, 1, 1, 1, 1, 0, 0, 1, 1), gender = c(2L,
2L, 2L, 1L, 2L, 1L, 1L, 1L, 2L, 2L), cancer_type = c(3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L), Treatment = c(5L, 6L, 6L, 6L,
5L, 6L, 5L, 6L, 5L, 5L), totaldata_new.Age = c(50L, 66L, 51L,
60L, 31L, 70L, 51L, 56L, 65L, 62L), Adipocytes = c(0, 0, 0.00559207695850587,
0.00584409167696122, 0.0381757121622292, 0, 0.00506330308366599,
0, 0.0156430635414994, 0), B.cells = c(0.0452143935493372, 1.30047878079526,
0.184967800962064, 0.0328904854435036, 0.0416414264467815, 0.00647774047514386,
0.653999365837062, 0.0331653878504112, 0.0286461940371656, 0.0888471904628742
), Basophils = c(0.120695085116671, 0.0656158162440011, 0.362173521572841,
0.0392149412975555, 0.225555640419744, 0.0569266227666268, 0.0190762558461507,
0.0733199539844435, 0.20291673586147, 0.0757313145147394), CD4..memory.T.cells = c(0,
0.24081994997988, 0, 0.0084070550945875, 0, 0, 0.0704387567897827,
0, 0.0177784010286187, 0.00653794301542519), CD4..naive.T.cells = c(0,
0.222121262122827, 0, 0, 0, 0, 0.0337776019379054, 0, 0, 0)), row.names = c("Pt10",
"Pt101", "Pt103", "Pt106", "Pt11", "Pt17", "Pt18", "Pt26", "Pt27",
"Pt28"), class = "data.frame")
Why is this variable producing NA, while other variables give good results of the correlation?
Looks like there are NA values in the columns of interest.
To avoid this problem, there is the parameter 'use' in the 'cor'-function, which the help explains as:
"giving a method for computing covariances in the presence of missing
values."
I'd recommend changing your code to:
pearsons[i] = cor(Train_set[,i], Train_set$PD, method = 'pearson',
use = "complete.obs")
Hope that helps!
Samuel
I am trying to create a descriptive table (to export to LaTeX) with Stargazer. For some reason, in some datasets, like the one below, it only yields the first line (the column titles of the table).
I am not sure why this happens. It seems there is something wrong with my data. I saw a similar problem here, but the desired table was not of summary statistics. In my case, if I apply the solution presented there (turn the dataset into a matrix), the resulting table is of the entire dataset, not descriptive stats.
Below is the code for some sample data, along with what I tried in order to produce the tables.
df <- structure(list(country = structure(c(26L, 5L, 26L, 25L, 25L,
3L, 27L, 27L, 27L, 6L, 27L, 26L, 6L, 10L, 1L, 5L, 27L, 20L, 27L,
10L, 26L, 1L, 2L, 15L, 10L, 1L, 26L, 17L, 9L, 7L, 11L, 1L, 26L,
20L, 27L, 6L, 1L, 25L, 9L, 2L, 22L, 15L, 23L, 27L, 9L, 27L, 8L,
27L, 19L, 24L, 20L, 19L, 22L, 26L, 26L, 10L, 10L, 16L, 20L, 25L,
2L, 26L, 1L, 27L, 11L, 1L, 2L, 8L, 25L, 27L, 26L, 9L, 10L, 4L,
9L, 27L, 7L, 19L, 27L, 26L, 6L, 17L, 11L, 16L, 16L, 26L, 27L,
26L, 27L, 14L, 23L, 27L, 23L, 10L, 9L, 23L, 22L, 26L, 26L, 27L
), .Label = c("Austria", "Belgium", "Bulgaria", "Cyprus", "Czech Republic",
"Denmark", "Estonia", "Finland", "France", "Germany", "Greece",
"Hungary", "Ireland", "Italy", "Latvia", "Lithuania", "Luxembourg",
"Malta", "Netherlands", "Poland", "Portugal", "Romania", "Slovakia",
"Slovenia", "Spain", "Sweden", "United Kingdom"), class = "factor"),
ptyname_eng = structure(c("Centre Party", "Green Party",
"Moderate Party", "Europe of the Peoples-The Greens", "Europe of the Peoples-The Greens",
"Order, Lawfulness, and Justice (RZS)", "Labour", "Green Party",
"Labour", "Liberal Party", "Independent", "Liberal People's Party",
"Danish People's Party", "Free Democratic Party", "Austrian People's Party",
"Christian and Democratic Union-Czechoslovak People's Party",
"Green Party", "Coalition Agreement for the Future - CenterLeft",
"UK Independence Party (UKIP)", "Free Democratic Party",
"Centre Party", "The Greens", "Ecolo", "Libertas", "Free Voters",
"The Greens", "Centre Party", "Christian Social People's Party",
"Europe Écologie", "People's Union of Estonia", "Action",
"Communist Party of Austria", "Centre Party", "Coalition Agreement for the Future - CenterLeft",
"Jury Team", "Social Democrats", "Austrian People's Party",
"Union, Progress, and Democracy", "The Left Front (Left Party + French Communist Party)",
"Ecolo", "Greater Romania Party", "Harmony Centre", "Green Party",
"Green Party", "Socialist Party", "No2EU: Yes to Democracy",
"Social Democratic Party of Finland", "Conservatives", "Libertas",
"For Real", "Civic Platform", "Christian Union-Reformed Political Party",
"Democratic Liberal Party", "Sweden Democrats", "Green Party",
"Free Democratic Party", "Christian Democratic Union/Christian Social Union",
"Civic Democratic Party", "Civic Platform", "Union, Progress, and Democracy",
"Christian Democratic and Flemish Party", "Pirate Party",
"The Greens", "Socialist Labour Party SLP", "New Democracy",
"The Greens", "Christian Democratic and Flemish Party", "Left Alliance",
"Union, Progress, and Democracy", "British National Party (BNP)",
"Left Party", "The Left Front (Left Party + French Communist Party)",
"Christian Social Democrats", "Progressive Party of Working People",
"Socialist Party", "UK Independence Party (UKIP)", "Social Democratic Party",
"Democrats 66", "Mebyon Kernow", "June List", "Socialist People's Party",
"Christian Social People's Party", "New Democracy", "Frontas Party",
"Homeland Union - Lithuanian Christian Democrats", "Liberal People's Party",
"No2EU: Yes to Democracy", "Centre Party", "Pro Democracy: Libertas.eu",
"Anticapitalist List", "Conservative Democrats of Slovakia",
"Christian Party - Christian People's Alliance", "Slovak Democratic and Christian Union",
"Free Democratic Party", "Europe Écologie", "Direction - Social Democracy",
"Democratic Union of Hungarians in Romania", "Centre Party",
"Liberal People's Party", "Conservatives"), label = "Party name (in English)", format.stata = "%75s"),
votes = c(5.47, 2.06, 18.83, 2.49, 2.49, 4.67, 15.7, 8.6,
15.7, 20.23, NA, 13.58, 15.28, 11, 29.98, 7.64, 8.6, 2.44,
16.5, 11, 5.47, 9.93, 8.64, 4.3, 1.7, 9.93, 5.47, 31.32,
16.28, 2.2, 0.76, 0.66, 5.47, 2.44, 0.5, 21.49, 29.98, 2.85,
6.05, 8.64, 8.65, 19.57, 2.11, 8.6, 16.48, 1.01, 17.5, 27.7,
0.32, 9.76, 44.43, 6.82, 29.71, 3.27, 11.02, 11, 30.7, 1.35,
44.43, 2.85, 14.43, 7.13, 9.93, 1.1, 32.3, 9.93, 14.43, 5.9,
2.85, 6.2, 5.66, 6.05, 7.2, 34.9, 16.48, 16.5, 8.7, 11.32,
0.1, 3.55, 15.86, 31.32, 32.3, 2.43, 26.86, 13.58, 1.01,
5.47, 0.5, 3.39, 2.1, 1.6, 16.98, 11, 16.28, 32.01, 8.92,
5.47, 13.58, 27.7), seats = c(1, 0, 4, 1, 1, 0, 13, 2, 13,
3, NA, 3, 2, 12, 6, 2, 2, 0, 13, 12, 1, 2, 2, 0, 0, 2, 1,
3, 14, 0, 0, 0, 1, 0, 0, 4, 6, 1, 4, 2, 3, 2, 0, 2, 14, 0,
2, 25, 0, 1, 25, 2, 10, 0, 2, 12, 34, 0, 25, 1, 3, 1, 2,
0, 8, 2, 3, 0, 1, 2, 1, 4, 8, 2, 14, 13, 1, 3, 0, 0, 2, 3,
8, 0, 4, 3, 0, 1, 0, 0, 0, 0, 2, 12, 14, 5, 3, 1, 3, 25),
lsq = c(5.65121280548163, 11.0409569967897, 5.65121280548163,
2.09070598337411, 2.09070598337411, 18.4291883786975, 7.64222053188085,
7.64222053188085, 7.64222053188085, 8.49009926343377, NA,
5.65121280548163, 8.49009926343377, 4.03227351136326, 5.21894849144935,
11.0409569967897, 7.64222053188085, 4.87412556260095, 7.64222053188085,
4.03227351136326, 5.65121280548163, 5.21894849144935, 4.16813185258047,
9.99651439252703, 4.03227351136326, 5.21894849144935, 5.65121280548163,
14.683991850538, 10.16545, 10.829844309951, 3.4795321239576,
5.21894849144935, 5.65121280548163, 4.87412556260095, 7.64222053188085,
8.49009926343377, 5.21894849144935, 2.09070598337411, 10.1654489717407,
4.16813185258047, 2.66691671130863, 9.99651439252703, 7.88621704182489,
7.64222053188085, 10.1654489717407, 7.64222053188085, 6.48086623985829,
7.64222053188085, 3.63340749159794, 10.4201368043039, 4.87412556260095,
3.63340749159794, 2.66691671130863, 5.65121280548163, 5.65121280548163,
4.03227351136326, 4.03227351136326, 8.7655582315738, 4.87412556260095,
2.09070598337411, 4.16813185258047, 5.65121280548163, 5.21894849144935,
7.64222053188085, 3.4795321239576, 5.21894849144935, 4.16813185258047,
6.48086623985829, 2.09070598337411, 7.64222053188085, 5.65121280548163,
10.1654489717407, 4.03227351136326, 6.81012399952372, 10.1654489717407,
7.64222053188085, 10.829844309951, 3.63340749159794, 7.64222053188085,
5.65121280548163, 8.49009926343377, 14.683991850538, 3.4795321239576,
8.7655582315738, 8.7655582315738, 5.65121280548163, 7.64222053188085,
5.65121280548163, 7.64222053188085, 6.15555836341693, 7.88621704182489,
7.64222053188085, 7.88621704182489, 4.03227351136326, 10.16545,
7.88621704182489, 2.66691671130863, 5.65121280548163, 5.65121280548163,
7.64222053188085), v020_03 = c(5, 5, 3, 5, 5, 3, 5, 5, 3,
NA, 5, 5, 3, 4, 3, 2, 5, NA, 4, 5, 5, 5, 5, NA, 2, 5, 4,
5, 5, 2, NA, NA, 5, 5, 2, 5, 3, 4, 5, 5, 3, 5, 5, 5, 5, 5,
5, 3, 3, NA, 2, 2, 3, 1, 5, 5, 1, 3, 3, NA, 1, 5, 5, 5, 3,
5, 5, 5, 4, 2, NA, 5, 1, 3, 2, 3, NA, 5, 5, 4, NA, 4, 2,
NA, 2, 5, 5, 5, 3, 5, 2, 1, 3, 3, 5, NA, 4, 5, 5, 1), v020_04 = c(5,
4, NA, 1, 3, 5, 3, 2, 2, NA, 2, 4, 4, 5, 5, 2, 1, NA, 4,
5, 5, 2, 5, NA, 3, 3, 4, 4, 3, 4, NA, NA, 3, 3, 4, 1, 5,
4, 2, 5, 1, 5, 3, 4, 2, 1, 4, 5, 3, NA, 3, 4, 4, 4, 3, 5,
5, 2, 4, NA, 1, 1, 3, 1, 3, 2, 2, 2, 3, 3, NA, 1, 1, 3, 2,
4, NA, 5, 2, 3, NA, 5, 2, NA, 2, 2, 1, 4, 5, 3, 4, 3, 4,
5, 1, NA, 3, 2, 4, 5), v020_08 = c(5, 3, 3, 1, 3, 5, 3, 1,
2, NA, 2, 3, 4, 5, 5, 5, 1, NA, 5, 5, NA, 2, 3, NA, 4, 2,
3, 4, 1, 3, NA, NA, 3, 2, 3, 1, 4, 2, 1, 3, 1, 2, 3, 2, 1,
1, 3, 3, 3, NA, 4, 2, 4, 3, 2, 2, 5, 3, 4, NA, 2, 3, 1, 1,
3, 3, 2, 2, 3, 3, NA, 1, 5, 4, 1, 3, NA, 4, 2, 3, NA, 2,
3, NA, 2, 4, 1, 4, 1, 1, 4, 3, 4, 4, 1, NA, 5, 5, 3, 3),
v018_1 = c(8, 5, 5, 1, 3, 9, 3, 2, 2, NA, NA, 5, 5, 5, 7,
7, 1, NA, 7, 5, 6, 4, 3, NA, 8, 4, 5, 6, 0, 5, NA, NA, 5,
2, 7, 3, 8, 4, 0, 4, 0, 7, 7, NA, 0, 0, 3, 8, 5, NA, 7, 5,
10, 8, 4, 5, 8, 5, 8, NA, 5, 2, 0, 10, 6, 3, 3, 2, 4, 6,
NA, 0, 8, 4, 0, 6, NA, 4, 3, 5, NA, 7, 6, NA, 9, 8, 0, 6,
5, 0, 8, 6, 8, 5, 2, NA, 6, 5, 7, 7), v020_05 = c(1, 1, 1,
1, 3, 1, 1, 1, 2, NA, 1, 2, 4, 2, 4, 3, 1, NA, 3, 2, 1, 1,
4, NA, 2, 1, 2, 3, 1, 1, NA, NA, 2, 1, 1, 1, 2, 4, 1, 2,
1, 1, 1, 1, 1, 1, 1, 1, 3, NA, 5, 4, 1, 4, 2, 2, 1, 4, 2,
NA, 3, 1, 1, 1, 1, 1, 2, 1, 3, 4, NA, 1, 4, 3, 1, 3, NA,
1, 1, 2, NA, 4, 1, NA, 4, 2, 1, 1, 3, 1, 4, 2, 2, 4, 1, NA,
1, 2, 1, 5), v020_02 = c(1, 3, 1, 5, 4, 1, 3, 5, 4, NA, 3,
2, 3, 2, 1, 3, 5, NA, 1, 1, 1, 5, 4, NA, 1, 2, 2, 1, 5, 2,
NA, NA, 3, 3, 1, 5, 3, 2, 5, 4, 1, 2, 2, 5, 5, 5, 3, 3, 1,
NA, 1, 4, 1, 2, 4, 1, 1, 3, 1, NA, 3, 4, 4, 4, 3, 4, 5, 4,
2, 3, NA, 5, 1, 2, 2, 1, NA, 2, 3, 2, NA, 1, 2, NA, 2, 2,
5, 2, 1, 4, 2, 4, 2, 2, NA, NA, 2, 3, 1, 1)), row.names = c(NA,
-100L), class = c("tbl_df", "tbl", "data.frame"))
stargazer (df)
stargazer(as.matrix(df)
df <- as.matrix(df)
stargazer(df)
Convert the data to a data frame.
stargazer::stargazer(data.frame(df))
With type = 'text'
stargazer::stargazer(data.frame(df), type = 'text')
===========================================================
Statistic N Mean St. Dev. Min Pctl(25) Pctl(75) Max
-----------------------------------------------------------
votes 99 11.784 10.443 0.100 3.330 16.380 44.430
seats 99 4.515 6.649 0.000 0.500 4.000 34.000
lsq 99 6.627 2.903 2.091 4.521 7.886 18.429
v020_03 88 3.852 1.352 1.000 3.000 5.000 5.000
v020_04 87 3.172 1.340 1.000 2.000 4.000 5.000
v020_08 87 2.828 1.278 1.000 2.000 4.000 5.000
v018_1 86 4.767 2.660 0.000 3.000 7.000 10.000
v020_05 88 1.966 1.198 1.000 1.000 3.000 5.000
v020_02 87 2.667 1.403 1.000 1.000 4.000 5.000
-----------------------------------------------------------
We can also convert to tibble
stargazer::stargazer(as_tibble(df))
I'm currently learning R with a book and was trying the mutate_at function from dplyr. In this example I want to standardize the survey items on a scale from 0 to 1. To do this, we can divide each value by the (theoretical) maximum value of the scale.
The book example stats_test from the package "pradadata" works perfectly fine:
data(stats_test, package = "pradadata")
stats_test %>%
drop_na() %>%
mutate_at(.vars = vars(study_time, self_eval, interest),
.funs = funs(prop = ./max(.))) %>%
select(contains("_prop"))
Output:
study_time_prop self_eval_prop interest_prop
<dbl> <dbl> <dbl>
1 0.6 0.7 0.667
2 0.8 0.8 0.833
3 0.6 0.4 0.167
4 0.8 0.7 0.833
5 0.4 0.6 0.5
6 0.4 0.6 0.667
7 0.8 0.6 0.5
8 0.2 0.7 0.667
9 0.6 0.8 0.833
10 0.6 0.7 0.833
# ... with 1,617 more rows
I tried the same code with my own data, but it doesn't work and I can't figure out why. The variable RG04 from my data has a range from 1-5. I tried to transform the variable from numeric to integer, because the variables from the stats_test data are integer too:
df_literacy_2 <- transform(df_literacy, RG04 = as.integer(RG04))
df_literacy_2 <- tibble(df_literacy_2)
df_literacy_2 %>%
drop_na() %>%
mutate_at(.vars = vars(RG04),
.funs = funs(prop = ./max(.))) %>%
select(contains("_prop"))
Output:
# A tibble: 0 x 0
Warning messages:
1: Problem with `mutate()` input `prop`.
i no non-missing arguments to max; returning -Inf
i Input `prop` is `RG04/max(RG04)`.
2: In base::max(x, ..., na.rm = na.rm) :
no non-missing arguments to max; returning -Inf
str(df_literacy_2$RG04)
int [1:630] 2 4 2 1 2 2 1 3 1 3 ...
Why doesn't it work on my data?
Thank you for your help.
Edit with sample of df_literacy:
> dput(head(df_literacy,20))
structure(list(CASE = c(40, 41, 44, 45, 48, 49, 54, 55, 56, 57,
58, 61, 62, 63, 64, 65, 66, 67, 68, 69), SERIAL = c(NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA), REF = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA), QUESTNNR = c("base", "base",
"base", "base", "base", "base", "base", "base", "base", "base",
"base", "base", "base", "base", "base", "base", "base", "base",
"base", "base"), MODE = c("interview", "interview", "interview",
"interview", "interview", "interview", "interview", "interview",
"interview", "interview", "interview", "interview", "interview",
"interview", "interview", "interview", "interview", "interview",
"interview", "interview"), STARTED = structure(c(1607290462,
1607290608, 1607291086, 1607291118, 1607291265, 1607291793, 1607294071,
1607294336, 1607294337, 1607294419, 1607294814, 1607296474, 1607301809,
1607329348, 1607333933, 1607335996, 1607336207, 1607336378, 1607343194,
1607343414), tzone = "UTC", class = c("POSIXct", "POSIXt")),
EI01 = structure(c(2L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L), .Label = c("Ja",
"Nein", "Nicht beantwortet"), class = "factor"), EI02 = c(2,
2, 2, 1, 1, 2, 1, 2, 2, 2, 2, 1, 2, 2, 1, 1, 1, 1, 2, 3),
RF01 = c(4, 2, 4, 3, 4, 4, 1, 3, 2, 3, 4, 3, 2, 3, 2, 2,
4, 2, 5, 3), RF02 = c(1, 1, 1, 1, 2, 2, 1, 2, 1, 1, 2, 1,
1, 1, 2, 2, 2, 2, 2, 2), RF03 = c(1, 2, 2, 2, 1, 2, 1, 1,
1, 1, 2, 1, 1, 2, 2, 2, 1, 2, 1, 2), RG01 = c(2, 2, 2, 2,
2, 2, 1, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2), RG02 = c(3,
3, 3, 3, 4, 3, 4, 2, 4, 2, 3, 4, 4, 2, 4, 3, 4, 3, 4, 4),
RG03 = c(3, 2, 2, 3, 3, 3, 1, 3, 1, 2, 3, 1, 2, 2, 1, 3,
2, 3, 2, 2), RG04 = c(2, 4, 2, 1, 2, 2, 1, 3, 1, 3, 2, 4,
1, 1, 1, 1, 1, 2, 4, 1), RG05 = c(1, 1, 1, 1, 1, 1, 1, 2,
1, 2, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1), SD01 = structure(c(2L,
1L, 1L, 1L, 1L, 2L, 1L, 2L, 1L, 1L, 2L, 1L, 1L, 1L, 2L, 2L,
2L, 2L, 1L, 1L), .Label = c("weiblich", "männlich", "divers",
"nicht beantwortet"), class = "factor"), SD03 = c(4, 3, 2,
2, 1, 2, 4, 4, 1, 4, 3, 1, 2, 3, 2, 4, 2, 3, 1, 3), SD05_01 = c(23,
22, 22, 21, 18, 22, 21, 27, 17, 22, 17, 21, 21, 22, 50, 25,
23, 20, 23, 23), TIME001 = c(2, 3, 23, 73, 29, 2, 3, 3, 29, 7,
50, 55, 3, 2, 10, 2, 1, 5, 7, 35), TIME002 = c(2, 2, 16,
34, 12, 14, 2, 2, 21, 2, 30, 24, 21, 3, 3, 2, 3, 2, 3, 22
), TIME003 = c(34, 8, 12, 15, 13, 12, 12, 7, 13, 11, 16,
10, 11, 16, 8, 8, 7, 8, 11, 14), TIME004 = c(60, 33, 25,
31, 45, 25, 14, 13, 38, 35, 50, 50, 37, 32, 32, 25, 72, 55,
28, 29), TIME005 = c(84, 21, 29, 41, 54, 33, 30, 22, 32,
42, 44, 23, 65, 30, 28, 32, 51, 31, 27, 44), TIME006 = c(14,
9, 27, 11, 24, 8, 8, 9, 18, 12, 35, 33, 27, 46, 11, 15, 8,
14, 12, 14), TIME007 = c(3, 18, 3, 5, 6, 2, 9, 2, 3, 3, 6,
7, 3, 13, 4, 4, 378, 3, 4, 10), TIME_SUM = c(199, 94, 135,
142, 183, 96, 78, 58, 154, 112, 186, 152, 167, 142, 96, 88,
146, 118, 92, 168), MAILSENT = c(NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA),
LASTDATA = structure(c(1607290661, 1607290702, 1607291221,
1607291328, 1607291448, 1607291889, 1607294149, 1607294394,
1607294491, 1607294531, 1607295045, 1607296676, 1607301976,
1607329490, 1607334030, 1607336084, 1607336727, 1607336496,
1607343286, 1607343582), tzone = "UTC", class = c("POSIXct",
"POSIXt")), FINISHED = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1), Q_VIEWER = c(0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), LASTPAGE = c(7,
7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7),
MAXPAGE = c(7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
7, 7, 7, 7, 7), MISSING = c(7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
7, 7, 7, 7, 7, 7, 0, 7, 7, 7), MISSREL = c(1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1), TIME_RSI = c("46023",
"14246", "0.75", "0.63", "0.54", "12055", "17533", "30682",
"0.7", "44197", "0.45", "0.58", "0.83", "44378", "44501",
"18629", "46753", "46388", "44197", "0.57"), DEG_TIME = c(27,
27, 3, 1, 0, 23, 30, 42, 2, 17, 0, 2, 7, 18, 10, 27, 43,
18, 8, 0)), row.names = c(NA, -20L), class = c("tbl_df",
"tbl", "data.frame"))
Edit with TRUE and FALSE NAs:
> sapply(df_literacy, function(a) table(c(T,F,is.na(a)))-1)
CASE SERIAL REF QUESTNNR MODE STARTED EI01 EI02 RF01 RF02 RF03 RG01 RG02 RG03 RG04 RG05 SD01 SD03 SD05_01 TE03_01 TIME001 TIME002 TIME003
FALSE 630 0 0 630 630 630 630 630 630 630 630 630 630 630 630 630 629 629 615 99 630 630 630
TRUE 0 630 630 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 15 531 0 0 0
TIME004 TIME005 TIME006 TIME007 TIME_SUM MAILSENT LASTDATA FINISHED Q_VIEWER LASTPAGE MAXPAGE MISSING MISSREL TIME_RSI DEG_TIME
FALSE 630 630 629 625 630 0 630 630 630 630 630 630 630 630 630
TRUE 0 0 1 5 0 630 0 0 0 0 0 0 0 0 0
There are a few things to correct here.
drop_na() is removing all of your data.
drop_na(df_literacy)
# # A tibble: 0 x 37
# # ... with 37 variables: CASE <dbl>, SERIAL <lgl>, REF <lgl>, QUESTNNR <chr>,
# # MODE <chr>, STARTED <dttm>, EI01 <fct>, EI02 <dbl>, RF01 <dbl>, RF02 <dbl>,
# # RF03 <dbl>, RG01 <dbl>, RG02 <dbl>, RG03 <dbl>, RG04 <dbl>, RG05 <dbl>,
# # SD01 <fct>, SD03 <dbl>, SD05_01 <dbl>, TIME001 <dbl>, TIME002 <dbl>,
# # TIME003 <dbl>, TIME004 <dbl>, TIME005 <dbl>, TIME006 <dbl>, TIME007 <dbl>,
# # TIME_SUM <dbl>, MAILSENT <lgl>, LASTDATA <dttm>, FINISHED <dbl>,
# # Q_VIEWER <dbl>, LASTPAGE <dbl>, MAXPAGE <dbl>, MISSING <dbl>,
# # MISSREL <dbl>, TIME_RSI <chr>, DEG_TIME <dbl>
The problem is that you have several columns that are completely NA, namely SERIAL, REF, and MAILSENT.
sapply(df_literacy, function(a) table(c(T,F,is.na(a)))-1)
# CASE SERIAL REF QUESTNNR MODE STARTED EI01 EI02 RF01 RF02 RF03 RG01 RG02
# FALSE 20 0 0 20 20 20 20 20 20 20 20 20 20
# TRUE 0 20 20 0 0 0 0 0 0 0 0 0 0
# RG03 RG04 RG05 SD01 SD03 SD05_01 TIME001 TIME002 TIME003 TIME004 TIME005
# FALSE 20 20 20 20 20 20 20 20 20 20 20
# TRUE 0 0 0 0 0 0 0 0 0 0 0
# TIME006 TIME007 TIME_SUM MAILSENT LASTDATA FINISHED Q_VIEWER LASTPAGE
# FALSE 20 20 20 0 20 20 20 20
# TRUE 0 0 0 20 0 0 0 0
# MAXPAGE MISSING MISSREL TIME_RSI DEG_TIME
# FALSE 20 20 20 20 20
# TRUE 0 0 0 0 0
Drop the drop_na(), or at least drop_na(-SERIAL, -REF, -MAILSENT).
Your code is using funs, which has been deprecated since dplyr-0.8.0.
# Warning: `funs()` is deprecated as of dplyr 0.8.0.
# Please use a list of either functions or lambdas:
# # Simple named list:
# list(mean = mean, median = median)
# # Auto named with `tibble::lst()`:
# tibble::lst(mean, median)
# # Using lambdas
# list(~ mean(., trim = .2), ~ median(., na.rm = TRUE))
While this isn't causing an error, it is causing a warning (and will likely stop working at some point). Change your mutate_at to be:
mutate_at(.vars = vars(RG04, RF02),
.funs = list(prop = ~ . / max(.)))
You are using a single variable within .vars and a single function within .funs, so the column names are preserved as-is (and you will not see a _prop column). From ?mutate_at:
The names of the new columns are derived from the names of the
input variables and the names of the functions.
• if there is only one unnamed function (i.e. if '.funs' is an
unnamed list of length one), the names of the input variables
are used to name the new columns;
• for _at functions, if there is only one unnamed variable
(i.e., if '.vars' is of the form 'vars(a_single_column)') and
'.funs' has length greater than one, the names of the
functions are used to name the new columns;
• otherwise, the new names are created by concatenating the
names of the input variables and the names of the functions,
separated with an underscore '"_"'.
If you aren't going to add more variables and functions, then you need to self-name it in the call, as in mutate_at(.vars = vars(RG04 = RG04), ...). Oddly enough, this causes it to produce RG04_prop.
If we fix all of those, then it works.
df_literacy %>%
drop_na(-SERIAL, -REF, -MAILSENT) %>%
mutate_at(.vars = vars(RG04 = RG04),
.funs = list(prop = ~ ./max(.))) %>%
select(contains("_prop")) %>%
head(3)
# A tibble: 3 x 1
# RG04_prop
# <dbl>
# 1 0.5
# 2 1
# 3 0.5
I have two lists of matrices. Here is an example of their structure:
list1<- list(structure(c(1, 2, 7, 1, 3, 0, 0, 0, 1, 4, 1, 3, 2, 3, 4,
6, 0, 0, 0, 3, 3), .Dim = c(7L, 3L), .Dimnames = list(c("lepA",
"lepB", "lepC", "lepD", "lepE", "lepF", "lepG"), NULL)), structure(c(1,
3, 7, 1, 3, 2, 3, 4, 6, 4, 1, 3, 3, 3), .Dim = c(7L, 2L), .Dimnames = list(
c("lepA", "lepB", "lepC", "lepD", "lepE", "lepF", "lepG"),
NULL)), structure(c(5, 8, 7, 1, 3, 3, 3), .Dim = c(7L, 1L
), .Dimnames = list(c("lepA", "lepB", "lepC", "lepD", "lepE",
"lepF", "lepG"), NULL)))
list2<-list(structure(c(6, 1, 51, 13, 15, 0, 0, 0, 6, 50, 13, 15, 6,
5, 5, 9, 0, 0, 0, 7, 5), .Dim = c(7L, 3L), .Dimnames = list(c("lepA",
"lepB", "lepC", "lepD", "lepE", "lepF", "lepG"), NULL)), structure(c(6,
7, 51, 13, 15, 6, 5, 5, 9, 50, 13, 15, 7, 5), .Dim = c(7L, 2L
), .Dimnames = list(c("lepA", "lepB", "lepC", "lepD", "lepE",
"lepF", "lepG"), NULL)), structure(c(11, 10, 51, 13, 15, 7, 5
), .Dim = c(7L, 1L), .Dimnames = list(c("lepA", "lepB", "lepC",
"lepD", "lepE", "lepF", "lepG"), NULL)))
I need to divide each element of each matrix in the first list by the corresponding element of the matching matrix in the second list. It's as though the two lists of matrices were one list of arrays, with the quotient calculated for each array element. The result would be:
list<- list(list1[[1]]/list2[[1]], list1[[2]]/list2[[2]], list1[[3]]/list2[[3]])
I tried:
list1/list2
Use Map:
Map(`/`, list1, list2)
#[[1]]
# [,1] [,2] [,3]
#lepA 0.16666667 NaN 0.8000000
#lepB 2.00000000 0.16666667 0.6666667
#lepC 0.13725490 0.08000000 NaN
#lepD 0.07692308 0.07692308 NaN
#lepE 0.20000000 0.20000000 NaN
#lepF NaN 0.33333333 0.4285714
#lepG NaN 0.60000000 0.6000000
#[[2]]
# [,1] [,2]
#lepA 0.16666667 0.80000000
#lepB 0.42857143 0.66666667
#lepC 0.13725490 0.08000000
#lepD 0.07692308 0.07692308
#lepE 0.20000000 0.20000000
#lepF 0.33333333 0.42857143
#lepG 0.60000000 0.60000000
#[[3]]
# [,1]
#lepA 0.45454545
#lepB 0.80000000
#lepC 0.13725490
#lepD 0.07692308
#lepE 0.20000000
#lepF 0.42857143
#lepG 0.60000000
Or use map2 from purrr:
purrr::map2(list1, list2, `/`)
We can also use lapply over seq_along(list1), indexing both lists in parallel:
lapply(seq_along(list1), function(i) list1[[i]]/list2[[i]])
I have a dataset that looks like
structure(list(ID = 1:100, A = c(1, 1, 1, 0, 0, 0, 1, 0, 1, 1,
0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0,
0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0,
1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0,
1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1,
0, 1, 1, 1, 1, 1), B = c(-0.779571910800539, -1.01813937566596,
-0.617199891458882, 0.0309580500019241, 0.543273110365976, -0.0433300396605711,
0.230179974373525, -0.183807679340284, 1.23013876772693, -0.447068495884132,
-0.529019912858711, -0.423813233701193, -2.02301749716477, 0.107354643198155,
-0.182036878855649, -0.0686544314102692, -0.242211690200168,
0.235711424228903, -0.737085567507625, 1.08272499166402, 1.46797946789107,
0.676133655269793, 0.0970319828391364, -0.175265540837544, 1.01932401639564,
-1.6120456930373, -0.237498813763703, -1.0793071544667, 0.34060211076922,
0.358651319904244, 1.14185300245182, 0.643831607010375, -1.48935271976024,
1.52070114310115, 0.13758246936271, 0.677489791752007, -0.0421866338789382,
-0.963909996107064, -0.419518874496373, -1.94843733945541, -0.856606011022689,
0.950271505971139, -0.00501879225795071, -0.907348953277799,
0.176003279346265, 0.849120713832199, -0.682289211320935, 0.618834674100358,
-0.266654135174762, 1.38431159868239, 0.464047120137739, -0.478626559461985,
0.149837396236788, -1.22592409132424, 0.658992970998059, -0.755502690343619,
-1.64278237304159, 0.9123549798475, 0.212894692780789, -0.670549407572393,
2.37707712870178, -0.0295080172428597, -0.823140252108969, -0.428902533453998,
-0.435036177848892, 0.98534295091355, 1.24538388550067, 0.763169631787973,
0.0481870286750498, 0.373727588477095, 0.515173230638657, -0.980950523005618,
2.34498921196051, 1.16497367254483, 0.803207456941987, -1.20555741222113,
-1.69603664220648, -0.59655174894536, -0.471190748123387, 1.53055765388398,
0.426904841661558, -0.385574044956116, -1.05023815909094, -1.45225542235577,
-0.545485253245417, 0.173122341859165, -1.23651408987118, 0.438591835746343,
-0.826135255947115, 0.371873486298494, -0.422519474801474, -0.34343504002476,
-0.508591050193541, -1.64448384253113, -0.217712097435782, -0.396102247417337,
-0.324089563130585, 1.3108035615729, -1.74881781621313, -0.887343297491297
), C = c(2, 1, 2, 2, 1, 2, 1, 2, 3, 1, 1, 3, 2, 2, 3, 3, 2, 1,
1, 2, 3, 2, 3, 2, 2, 2, 3, 3, 3, 2, 2, 1, 3, 3, 2, 3, 3, 3, 3,
3, 1, 1, 2, 1, 1, 3, 3, 2, 3, 3, 1, 3, 1, 1, 2, 1, 1, 2, 1, 2,
2, 3, 2, 3, 3, 1, 2, 1, 3, 3, 2, 3, 3, 3, 3, 3, 2, 3, 3, 3, 1,
3, 3, 3, 3, 1, 1, 2, 1, 3, 2, 3, 3, 3, 1, 2, 2, 3, 3, 2), D = c(3,
2, 0, 1, 0, 2, 1, 1, 1, 2, 1, 3, 1, 0, 1, 2, 1, 1, 1, 3, 0, 3,
0, 0, 1, 3, 0, 3, 2, 1, 3, 1, 3, 0, 2, 1, 2, 0, 2, 2, 0, 0, 0,
3, 3, 3, 3, 2, 3, 2, 2, 1, 0, 2, 1, 0, 2, 0, 2, 2, 2, 3, 0, 1,
0, 3, 3, 1, 2, 1, 1, 0, 1, 0, 3, 1, 1, 1, 0, 2, 0, 3, 2, 3, 2,
2, 3, 3, 1, 2, 3, 3, 1, 2, 3, 2, 3, 3, 0, 2), E = c(0, 1, 0,
1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0,
1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1,
0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0,
1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0,
1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1), F = c(14L, 12L, 8L, 5L,
13L, 8L, 8L, 9L, 11L, 13L, 11L, 8L, 12L, 9L, 8L, 17L, 11L, 13L,
7L, 13L, 9L, 9L, 11L, 7L, 11L, 13L, 14L, 10L, 12L, 15L, 5L, 12L,
7L, 8L, 10L, 11L, 5L, 10L, 2L, 10L, 9L, 14L, 4L, 10L, 6L, 14L,
10L, 6L, 14L, 2L, 7L, 11L, 9L, 8L, 11L, 9L, 15L, 10L, 16L, 11L,
7L, 8L, 12L, 17L, 5L, 13L, 15L, 11L, 10L, 7L, 6L, 12L, 10L, 8L,
7L, 8L, 11L, 14L, 6L, 4L, 9L, 11L, 9L, 13L, 7L, 9L, 9L, 12L,
10L, 6L, 10L, 5L, 14L, 10L, 13L, 6L, 8L, 8L, 7L, 12L)), .Names = c("ID",
"A", "B", "C", "D", "E", "F"), row.names = c(NA, -100L), class = "data.frame")
However, my actual dataset has 100 columns. I would like to convert any variable that has five or fewer unique values to a factor. I am using dplyr with the following code:
df %>%
mutate_if(is.integer, as.numeric) %>%
mutate_if(length(unique(.)) <= 5, as.factor(.))
But I get the error:
Error: length(.p) == length(vars) is not TRUE
Any thoughts? I want to convert the variables with less than or equal to 5 unique values into factors.
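You're close; you're just missing a couple of ~s, which signal purrr-style anonymous functions. Without them, length(unique(.)) <= 5 is evaluated once on the whole data frame rather than on each column, so the predicate has length 1 instead of one value per column, which is what the error is complaining about: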
library(dplyr)
df %>% mutate_if(~length(unique(.x)) <= 5, ~as.factor(.x))
result:
> df %>% mutate_if(~length(unique(.x)) <= 5, ~as.factor(.x)) %>% glimpse()
Observations: 100
Variables: 7
$ ID <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35,...
$ A <fct> 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0...
$ B <dbl> -0.779571911, -1.018139376, -0.617199891, 0.030958050, 0.543273110, -0.043330040, 0.230179974, -0.183807679, 1.230138768, -0.44706...
$ C <fct> 2, 1, 2, 2, 1, 2, 1, 2, 3, 1, 1, 3, 2, 2, 3, 3, 2, 1, 1, 2, 3, 2, 3, 2, 2, 2, 3, 3, 3, 2, 2, 1, 3, 3, 2, 3, 3, 3, 3, 3, 1, 1, 2, 1...
$ D <fct> 3, 2, 0, 1, 0, 2, 1, 1, 1, 2, 1, 3, 1, 0, 1, 2, 1, 1, 1, 3, 0, 3, 0, 0, 1, 3, 0, 3, 2, 1, 3, 1, 3, 0, 2, 1, 2, 0, 2, 2, 0, 0, 0, 3...
$ E <fct> 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0...
$ F <int> 14, 12, 8, 5, 13, 8, 8, 9, 11, 13, 11, 8, 12, 9, 8, 17, 11, 13, 7, 13, 9, 9, 11, 7, 11, 13, 14, 10, 12, 15, 5, 12, 7, 8, 10, 11, 5...
Also note: if I remember correctly, . and .x can be used interchangeably in these anonymous functions. I am in the habit of using .x in case there is more than one argument (e.g., with purrr::map2).