as.numeric creates hanging decimal - r

Curious what's causing this behavior. I'm reading in an Excel file with numbers stored as text. When I convert to numeric, the integers get a decimal point added (e.g. 153 becomes 153.). No 0 after it, nothing, just a hanging decimal. Any idea how to correct this?
structure(list(`Nest ID` = c("21Lk", "21Lk", "21Lk", "21Lk",
"A-Frye"), `Clutch size` = c(NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_), `hatch size` = c("4", "4", "4",
"4", "7"), `hatch date` = c("146", "146", "146", "146", "153"
), Date = c("149", "167", "188", "247", "161"), Time = c("900",
"900", "1200", "1224", "1538"), Cygnets = c("4", "3", NA, NA,
"7"), detection = c("1", "1", "0", "0", "1"), `Age when found (days)` =
c("3",
"3", "3", "3", "8"), `Age at time of observation (days)` = c("3",
"21", "42", "101", "8"), `prob of complete brood loss` = c("0.5",
"0.5", "0.5", "0.5", "0.1"), `Predator fish present (1 likely, 2 unknown, 0
unlikely)` = c("1",
"1", "1", "1", "1"), `Wetland Area (NWI) (ha)` = c("109.4938966",
"109.4938966", "109.4938966", "109.4938966", "34.923899399999996"
), Notes = c(NA, NA, "unsure, could have been in the deep veg",
"unsure, could have been in the deep veg", NA)), .Names = c("Nest ID",
"Clutch size", "hatch size", "hatch date", "Date", "Time", "Cygnets",
"detection", "Age when found (days)", "Age at time of observation (days)",
"prob of complete brood loss", "Predator fish present (1 likely, 2 unknown,
0 unlikely)",
"Wetland Area (NWI) (ha)", "Notes"), row.names = c(NA, -5L), class =
c("tbl_df",
"tbl", "data.frame"))
And the code used to convert/clean up:
library(tidyverse)
test.2 <- test %>%
  rename(NestID = `Nest ID`,
         CS = `Clutch size`,
         HS = `hatch size`,
         HD = `hatch date`,
         DOY = Date,
         Age.firstobs = `Age when found (days)`,
         Age.current = `Age at time of observation (days)`,
         is.broodloss = `prob of complete brood loss`,
         pred.fish = `Predator fish present (1 likely, 2 unknown, 0 unlikely)`,
         WA = `Wetland Area (NWI) (ha)`) %>%
  mutate_at(vars(CS:DOY, Cygnets, Age.firstobs, Age.current, pred.fish), funs(as.numeric)) %>%
  mutate_at(vars(detection, is.broodloss, WA), funs(as.numeric))
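If the trailing dot only shows up when the tibble prints, nothing was actually added to the values: some versions of the tibble print method (pillar) display whole doubles with a trailing "." to flag that the column is <dbl> rather than <int>. A quick diagnostic sketch, assuming the converted object is test.2 as above:
test.2$HD                # plain vector printing: 146 146 146 146 153
as.data.frame(test.2)    # base data.frame printing shows no trailing dot
test.2$HD[5] == 153      # TRUE -- no fractional part was introduced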

Related

R - Dataframe is setting an arbitrary max of 10?

I am working on a personal project to visualize some NBA data, and when sorting within the dataframe created from reading a CSV file, every category (such as points, FGA, etc.) seems to be capped at a maximum of 10. Does anyone know why/how to "uncap" this?
Ex: Steph Curry's 3PA, for instance, is greater than 10 in the raw data.
However, max(stats$`3PA`) returns 9.2, and the same thing occurs with other categories such as Points, where max(stats$PTS) returns 9.9.
EDIT: dput(head(stats)):
> dput(head(stats))
structure(list(Player = c("Precious Achiuwa", "Steven Adams",
"Bam Adebayo", "LaMarcus Aldridge", "Ty-Shon Alexander", "Nickeil Alexander-Walker"
), Pos = c("PF", "C", "C", "C", "SG", "SG"), Age = c("21", "27",
"23", "35", "22", "22"), Tm = c("MIA", "NOP", "MIA", "TOT", "PHO",
"NOP"), G = c("61", "58", "64", "26", "15", "46"), GS = c("4",
"58", "64", "23", "0", "13"), MP = c("12.1", "27.7", "33.5",
"25.9", "3.1", "21.9"), FG = c("2.0", "3.3", "7.1", "5.4", "0.2",
"4.2"), FGA = c("3.7", "5.3", "12.5", "11.4", "0.8", "10.0"),
`FG%` = c(".544", ".614", ".570", ".473", ".250", ".419"),
`3P` = c("0.0", "0.0", "0.0", "1.2", "0.1", "1.7"), `3PA` = c("0.0",
"0.1", "0.1", "3.1", "0.6", "4.8"), `3P%` = c(".000", ".000",
".250", ".388", ".222", ".347"), `2P` = c("2.0", "3.3", "7.1",
"4.2", "0.1", "2.5"), `2PA` = c("3.7", "5.3", "12.4", "8.3",
"0.2", "5.2"), `2P%` = c(".546", ".620", ".573", ".505",
".333", ".485"), `eFG%` = c(".544", ".614", ".571", ".525",
".333", ".502"), FT = c("0.9", "1.0", "4.4", "1.6", "0.1",
"1.0"), FTA = c("1.8", "2.3", "5.5", "1.8", "0.1", "1.4"),
`FT%` = c(".509", ".444", ".799", ".872", ".500", ".727"),
ORB = c("1.2", "3.7", "2.2", "0.7", "0.1", "0.3"), DRB = c("2.2",
"5.2", "6.7", "3.8", "0.5", "2.8"), TRB = c("3.4", "8.9",
"9.0", "4.5", "0.7", "3.1"), AST = c("0.5", "1.9", "5.4",
"1.9", "0.4", "2.2"), STL = c("0.3", "0.9", "1.2", "0.4",
"0.0", "1.0"), BLK = c("0.5", "0.7", "1.0", "1.1", "0.1",
"0.5"), TOV = c("0.7", "1.3", "2.6", "1.0", "0.2", "1.5"),
PF = c("1.5", "1.9", "2.3", "1.8", "0.1", "1.9"), PTS = c("5.0",
"7.6", "18.7", "13.5", "0.6", "11.0"), c(" ", " ", " ", " ",
" ", " ")), row.names = c("1", "3", "4", "5", "6", "7"), class = "data.frame")
EDIT2: Update
max(as.numeric(stats$`3PA`)) seems to work, thank you all for bearing with my stupidity!
Seems like the problem was the columns being read in as strings rather than doubles, so converting with as.numeric() before taking the max fixed it!
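A minimal sketch (not from the original post) of converting the stat columns once instead of wrapping each call, using the column names from the dput above:
num_cols <- c("MP", "FG", "FGA", "3P", "3PA", "PTS")   # extend with the other per-game columns as needed
stats[num_cols] <- lapply(stats[num_cols], as.numeric)
max(stats$`3PA`)   # now the true maximum, not the largest string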

Plotting multiple binary variables on the same plot in ggplot

I am hoping to use ggplot to construct a barplot of frequencies (or just the % of 1s) for a bunch of binary variables, and am having trouble getting them all onto one plot.
The variables all stem from the same survey question, so ideally the data would be tidy with a single column for this variable, but respondents could select more than one option and I'd like to retain that rather than collapsing to a "more than one selected" category. Here is a slice of the data:
structure(list(gender = structure(c("Male", "Male", "Female",
"Female", "Female", "Female", "Male", "Male", "Male", "Male"), label = "Q4", format.stata = "%24s"),
var1 = structure(c("0", "0", "1", "1", "0", "0", "0", "0",
"0", "0"), format.stata = "%9s"), var2 = structure(c("0",
"98", "1", "0", "0", "0", "0", "0", "0", "0"), format.stata = "%9s"),
var3 = structure(c("0", "0", "0", "0", "0", "0", "0", "0",
"0", "0"), format.stata = "%9s"), var4 = structure(c("1",
"0", "1", "0", "0", "0", "1", "1", "0", "0"), format.stata = "%9s"),
var5 = structure(c("1", "0", "0", "0", "0", "1", "0", "0",
"0", "0"), format.stata = "%9s")), row.names = c(NA, -10L
), class = c("tbl_df", "tbl", "data.frame"))
Get the data in long format so that it is easier to plot.
library(tidyverse)

df %>%
  pivot_longer(cols = starts_with('var')) %>%
  group_by(name) %>%
  summarise(frequency_of_1 = sum(value == 1)) %>%
  # If you need the percentage, use mean instead of sum:
  # summarise(frequency_of_1 = mean(value == 1)) %>%
  ggplot() + aes(name, frequency_of_1) + geom_col()
In base R you can do this with colSums and barplot.
barplot(colSums(df[-1] == 1))
#For percentage
#barplot(colMeans(df[-1] == 1))

How to search for a specific character value that has spaces at the end using sqldf in R

I have a dataset that contains 18 columns relating to reported accidents. I am trying to find the number of accidents reported on a specific day of the week, for example the total number of accidents reported on Tuesday. For this I am using two variables in my dataset: Day_of_Week and Number_of_vehicle. I am running the sqldf query below; however, it shows me the total number of accidents for the entire week, whereas I would like to report a particular day, e.g. Monday. I should also add that there is a fair amount of trailing space in the DAY_OF_WEEK column. See below for an example:
Day_OF_Week Number_of_vehicle
MONDAY(Space here) 50
Some sample data:
> dput(head(myf,5))
structure(list(CASE_NUMBER = c("1363000002", "1296000023", "1283000016",
"1282000006", "1267000007"), BARRACK = c("Rockville", "Berlin",
"Prince Frederick", "Leonardtown", "Essex"), ACC_DATE = c("2012-01-01T00:00:00",
"2012-01-01T00:00:00", "2012-01-01T00:00:00", "2012-01-01T00:00:00",
"2012-01-01T00:00:00"), ACC_TIME = c("2:01", "18:01", "7:01",
"0:01", "1:01"), ACC_TIME_CODE = c("1", "5", "2", "1", "1"),
DAY_OF_WEEK = c("SUNDAY ", "SUNDAY ", "SUNDAY ", "SUNDAY ",
"SUNDAY "), ROAD = c("IS 00495 CAPITAL BELTWAY", "MD 00090 OCEAN CITY EXPWY",
"MD 00765 MAIN ST", "MD 00944 MERVELL DEAN RD", "IS 00695 BALTO BELTWAY"
), INTERSECT_ROAD = c("IS 00270 EISENHOWER MEMORIAL", "CO 00220 ST MARTINS NECK RD",
"CO 00208 DUKE ST", "MD 00235 THREE NOTCH RD", "IS 00083 HARRISBURG EXPWY"
), DIST_FROM_INTERSECT = c("0", "0.25", "100", "10", "100"
), DIST_DIRECTION = c("U", "W", "S", "E", "S"), CITY_NAME = c("Not Applicable",
"Not Applicable", "Not Applicable", "Not Applicable", "Not Applicable"
), COUNTY_CODE = c("15", "23", "4", "18", "3"), COUNTY_NAME = c("Montgomery",
"Worcester", "Calvert", "St. Marys", "Baltimore"), VEHICLE_COUNT = c("2",
"1", "1", "1", "2"), PROP_DEST = c("YES", "YES", "YES", "YES",
"YES"), INJURY = c("NO", "NO", "NO", "NO", "NO"), COLLISION_WITH_1 = c("VEH",
"FIXED OBJ", "FIXED OBJ", "FIXED OBJ", "VEH"), COLLISION_WITH_2 = c("OTHER-COLLISION",
"OTHER-COLLISION", "FIXED OBJ", "OTHER-COLLISION", "OTHER-COLLISION"
)), .Names = c("CASE_NUMBER", "BARRACK", "ACC_DATE", "ACC_TIME",
"ACC_TIME_CODE", "DAY_OF_WEEK", "ROAD", "INTERSECT_ROAD", "DIST_FROM_INTERSECT",
"DIST_DIRECTION", "CITY_NAME", "COUNTY_CODE", "COUNTY_NAME",
"VEHICLE_COUNT", "PROP_DEST", "INJURY", "COLLISION_WITH_1", "COLLISION_WITH_2"
), row.names = c(NA, 5L), class = "data.frame")
sqldf Code:
sqldf("select sum(Number_of_vehicle),DAY_OF_WEEK from accident group by DAY_OF_WEEK")
Any help is appreciated!
Thanks in advance!
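A sketch of one way to restrict the query to a single day: trim the trailing blanks inside the query itself. This assumes the data frame and column names from the dput above (myf, DAY_OF_WEEK, VEHICLE_COUNT) and SQLite's TRIM() function, which the default sqldf backend provides.
library(sqldf)

# TRIM() strips the trailing blanks stored in DAY_OF_WEEK so the literal matches
sqldf("SELECT DAY_OF_WEEK, SUM(VEHICLE_COUNT) AS total_vehicles
       FROM myf
       WHERE TRIM(DAY_OF_WEEK) = 'MONDAY'
       GROUP BY DAY_OF_WEEK")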

Convert column types to their read_csv() column type in R

One of my favorite things about library(readr) and the read_csv() function in R is that it almost always sets the column types of my data to the correct class. However, I am currently working with an API in R that returns data to me as a dataframe of all character classes, even if the data is clearly numbers. Take this dataframe for example, which has some sports data:
dput(mydf)
structure(list(isUnplayed = c("false", "false", "false"), isInProgress =
c("false", "false", "false"), isCompleted = c("true", "true", "true"), awayScore = c("106",
"95", "95"), homeScore = c("94", "97", "111"), game.ID = c("31176",
"31177", "31178"), game.date = c("2015-10-27", "2015-10-27",
"2015-10-27"), game.time = c("8:00PM", "8:00PM", "10:30PM"),
game.location = c("Philips Arena", "United Center", "Oracle Arena"
), game.awayTeam.ID = c("88", "86", "110"), game.awayTeam.City = c("Detroit",
"Cleveland", "New Orleans"), game.awayTeam.Name = c("Pistons",
"Cavaliers", "Pelicans"), game.awayTeam.Abbreviation = c("DET",
"CLE", "NOP"), game.homeTeam.ID = c("91", "89", "101"), game.homeTeam.City = c("Atlanta",
"Chicago", "Golden State"), game.homeTeam.Name = c("Hawks",
"Bulls", "Warriors"), game.homeTeam.Abbreviation = c("ATL",
"CHI", "GSW"), quarterSummary.quarter = list(structure(list(
`#number` = c("1", "2", "3", "4"), awayScore = c("25",
"23", "34", "24"), homeScore = c("25", "18", "23", "28"
)), .Names = c("#number", "awayScore", "homeScore"), class = "data.frame", row.names = c(NA,
4L)), structure(list(`#number` = c("1", "2", "3", "4"), awayScore = c("17",
"23", "28", "27"), homeScore = c("26", "20", "25", "26")), .Names = c("#number",
"awayScore", "homeScore"), class = "data.frame", row.names = c(NA,
4L)), structure(list(`#number` = c("1", "2", "3", "4"), awayScore = c("35",
"14", "26", "20"), homeScore = c("39", "20", "35", "17")), .Names = c("#number",
"awayScore", "homeScore"), class = "data.frame", row.names = c(NA,
4L)))), .Names = c("isUnplayed", "isInProgress", "isCompleted",
"awayScore", "homeScore", "game.ID", "game.date", "game.time",
"game.location", "game.awayTeam.ID", "game.awayTeam.City", "game.awayTeam.Name",
"game.awayTeam.Abbreviation", "game.homeTeam.ID", "game.homeTeam.City",
"game.homeTeam.Name", "game.homeTeam.Abbreviation", "quarterSummary.quarter"
), class = "data.frame", row.names = c(NA, 3L))
It is quite a hassle to deal with this dataframe once it is returned by the API, given the column classes. I've come up with a sort of hack to update them, which is as follows:
write_csv(mydf, 'mydf.csv')
mydf <- read_csv('mydf.csv')
By writing to CSV and then re-reading the CSV using read_csv(), the dataframe columns update. Unfortunately I am left with a CSV file in my directory that I don't want. Is there a way to update the columns of an R dataframe to their 'read_csv()' column classes, without actually having to write the CSV?
Any help is appreciated!
You don't need to write and read the data if you just want readr to guess your column types. You could use readr::type_convert for that:
library(magrittr)  # for %>%

iris %>%
  dplyr::mutate(Sepal.Width = as.character(Sepal.Width)) %>%
  readr::type_convert() %>%
  str()
For comparison:
iris %>%
  dplyr::mutate(Sepal.Width = as.character(Sepal.Width)) %>%
  str()
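Applied to the question's data frame, the same idea would be (a sketch; type_convert() only re-parses character columns, so the nested quarterSummary.quarter list column should be left untouched):
mydf <- readr::type_convert(mydf)
str(mydf)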
Try this code; type.convert() converts a character vector to logical, integer, numeric, complex or factor as appropriate.
indx <- which(sapply(df, is.character))         # locate the character columns
df[, indx] <- lapply(df[, indx], type.convert)  # let type.convert() guess their classes
indx <- which(sapply(df, is.factor))            # type.convert() may have produced factors
df[, indx] <- lapply(df[, indx], as.character)  # turn those back into character
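As a side note, on R 4.0 and later type.convert() warns if as.is is not specified; passing as.is = TRUE explicitly silences the warning and keeps unconverted text as character rather than factor, which makes the second factor-to-character pass above unnecessary. A one-line sketch of that variant:
df[, indx] <- lapply(df[, indx], type.convert, as.is = TRUE)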

3D euclidean distance to identify unknown samples

I have this dataframe called mydf containing three principal component scores (PCA.1, PCA.2, PCA.3). I want to compute the 3D Euclidean distance matrix and find the shortest distance between all pairs of Samples. In another dataframe called myref, some Samples have a known Identity and some are unknown. Using the shortest Euclidean distance computed from mydf, I want to assign each unknown sample the Identity of its nearest known sample. Can someone please help me get this done?
mydf
mydf <- structure(list(Sample = c("1", "2", "4", "5", "6", "7", "8",
"9", "10", "12"), PCA.1 = c(0.00338, -0.020373, -0.019842, -0.019161,
-0.019594, -0.019728, -0.020356, 0.043339, -0.017559, -0.020657
), PCA.2 = c(0.00047, -0.010116, -0.011532, -0.011582, -0.013245,
-0.011751, -0.010299, -0.005801, -0.01, -0.011334), PCA.3 = c(-0.008787,
0.001412, 0.003751, 0.00371, 0.004242, 0.003738, 0.000592, -0.037229,
0.004307, 0.00339)), .Names = c("Sample", "PCA.1", "PCA.2", "PCA.3"
), row.names = c(NA, 10L), class = "data.frame")
myref
myref<- structure(list(Sample = c("1", "2", "4", "5", "6", "7", "8",
"9", "10", "12"), Identity = c("apple", "unknown", "ball", "unknown",
"unknown", "car", "unknown", "cat", "unknown", "dog")), .Names = c("Sample",
"Identity"), row.names = c(NA, 10L), class = "data.frame")
uIX = which(myref$Identity == "unknown")
dMat = as.matrix(dist(mydf[, -1]))  # Euclidean distance matrix on PCA.1-PCA.3
nn = apply(dMat, 1, order)[2, ]     # for each row, order the distances in increasing order and
                                    # take the 2nd index (the 1st is always the row itself, distance 0)
myref$Identity[uIX] = myref$Identity[nn[uIX]]
Note that the above code can still assign "unknown" when an unknown sample's nearest neighbour is itself unknown. If instead you want to match to the nearest neighbour with a known identity, make the unknown-to-unknown distances ineligible right after building dMat (and before computing nn), keeping the zero self-distances so that the 2nd-smallest entry in each row is still the nearest eligible neighbour:
dMat[uIX, uIX] = Inf  # unknowns can no longer be each other's nearest neighbour
diag(dMat) = 0        # restore self-distances so order(...)[2, ] still skips only "self"
