Normalize a dataset in R

I have a dataset that looks like this
> dput(events.seq)
structure(list(vid = structure(1L, .Label = "2a38ebc2-dd97-43c8-9726-59c247854df5", class = "factor"),
deltas = structure(1L, .Label = "38479,38488,38492,38775,45595,45602,45606,45987,50280,50285,50288,50646,54995,55001,55005,55317,59528,59533,59537,59921,63392,63403,63408,63822,66706,66710,66716,67002,73750,73755,73759,74158,77999,78003,78006,78076,81360,81367,81371,82381,93365,93370,93374,93872,154875,154878,154880,154880,155866,155870", class = "factor"),
events = structure(1L, .Label = "mousemove,mousedown,mouseup,click,mousemove,mousedown,mouseup,click,mousemove,mousedown,mouseup,click,mousemove,mousedown,mouseup,click,mousemove,mousedown,mouseup,click,mousemove,mousedown,mouseup,click,mousemove,mousedown,mouseup,click,mousemove,mousedown,mouseup,click,mousemove,mousedown,mouseup,click,mousemove,mousedown,mouseup,click,mousemove,mousedown,mouseup,click,mousemove,mousedown,mouseup,click,mousemove,mousedown", class = "factor")), .Names = c("vid",
"deltas", "events"), class = "data.frame", row.names = c(NA,
-1L))
I need to normalize it to this structure:
> dput(test)
structure(list(vid = structure(c(1L, 1L, 1L), .Label = "2a38ebc2-dd97-43c8-9726-59c247854df5\n+ ", class = "factor"),
delta = c(38479, 38488, 38492), c..mousemove....mousedown....mousup.. = structure(c(2L,
1L, 3L), .Label = c("mousedown", "mousemove", "mousup"), class = "factor")), .Names = c("vid",
"delta", "c..mousemove....mousedown....mousup.."), row.names = c(NA,
-3L), class = "data.frame")
Any help appreciated.
I did try to use strsplit; the problem is that I want to split the second and third columns at the same time (they are always in sync in their lengths).

Try this:
# split both comma-separated strings; the two columns always hold the same number of elements
z <- with(events.seq, data.frame(
  deltas = strsplit(as.character(deltas), split = ",")[[1]],
  events = strsplit(as.character(events), split = ",")[[1]]
))
head(z)
The result:
deltas events
1 38479 mousemove
2 38488 mousedown
3 38492 mouseup
4 38775 click
5 45595 mousemove
6 45602 mousedown
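If you also want to carry the vid column along, as in the target structure, a sketch with tidyr::separate_rows should work, assuming both columns always hold the same number of comma-separated values:
library(dplyr)
library(tidyr)
events.seq %>%
  mutate(across(c(deltas, events), as.character)) %>%
  separate_rows(deltas, events, sep = ",", convert = TRUE)
convert = TRUE turns the split deltas back into numbers while leaving events as character.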

Related

How to find a file that has a different column type during import (and then not import it)?

I have a million csv files with the same column headers, but somewhere a non-double value has appeared in a file. They all import into a list which I then unlist fine. But when I need to manipulate these columns or perform some operation on them, I hit an error saying they can't be converted to type double. How can I find out which file has the problematic non-numeric entry?
library(data.table)
list_of_files <- list.files(pattern = "*.csv")
selectCols = c("pax.id","origin","destination","mask.wearing","prevalence","location.time.series","Dose.air","Dose.close","Dose.fomite")
# 'temp' is presumably one previously read file (not shown in the question)
selectCols = which(names(temp) %in% selectCols)
df <- rbindlist(sapply(list_of_files, fread, select = selectCols, simplify = FALSE),
                use.names = TRUE, idcol = "FileName")
EDIT:
Based on Ronak Shah's answer below:
df %>%
mutate(across(.fns = ~is.na(suppressWarnings(as.numeric(.))))) %>%
filter(Reduce(`|`, across(.fns = is.na))) %>%
distinct(FileName)
This produces an empty tibble.
Reproducible example:
EXAMPLE FILE with entries:
dput(OKFILE)
structure(list(pax_id = 0, origin = "A", destination = "F", mask_wearing = structure(1L, .Label = c("No",
"Yes"), class = "factor"), prevalence = 0.00025, dose_air = 0.000952123039018727,
dose_fomite = 6.61087941578422e-11, simulation = "ID1", dose_close = 0,
location_time_series1 = "2+", location_time_series2 = "2+",
location_time_series3 = "2+", location_time_series4 = "2+",
location_time_series5 = "2+", location_time_series6 = "none",
location_time_series7 = "none", location_time_series8 = "none",
location_time_series9 = "none"), class = c("data.table",
"data.frame"), row.names = c(NA, -1L), .internal.selfref = <pointer: 0x7fcfb00110e0>)
EXAMPLE FILE with a non-numeric row:
dput(OffendingFile)
structure(list(pax.id = structure(1L, .Label = "[-]", class = "factor"),
origin = structure(1L, .Label = "[-]", class = "factor"),
destination = structure(1L, .Label = "[-]", class = "factor"),
time.boarding = structure(1L, .Label = "[hh:mm:ss]", class = "factor"),
time.alighting = structure(1L, .Label = "[hh:mm:ss]", class = "factor"),
is.infectious = structure(1L, .Label = "[-]", class = "factor"),
mask.wearing = structure(1L, .Label = "[-]", class = "factor"),
seed = structure(1L, .Label = "[-]", class = "factor"), prevalence = structure(1L, .Label = "[-]", class = "factor"),
source.strength = structure(1L, .Label = "[virus s-1]", class = "factor"),
close.range.strength = structure(1L, .Label = "[virus s-1]", class = "factor"),
boarding.surfaces.touched = structure(1L, .Label = "[-]", class = "factor"),
alighting.surfaces.touched = structure(1L, .Label = "[-]", class = "factor"),
initial.hand.concentration = structure(1L, .Label = "[virus m-2]", class = "factor"),
final.hand.concentration = structure(1L, .Label = "[virus m-2]", class = "factor"),
location.time.series = structure(1L, .Label = "[-]", class = "factor"),
time.in.range.0.1.m = structure(1L, .Label = "[s]", class = "factor"),
time.in.range.1.2.m = structure(1L, .Label = "[s]", class = "factor"),
Dose.air = structure(1L, .Label = "[virus]", class = "factor"),
Dose.close = structure(1L, .Label = "[virus]", class = "factor"),
Dose.fomite = structure(1L, .Label = "[virus]", class = "factor"),
Probability.of.Infection = structure(1L, .Label = "[-]", class = "factor")), class = "data.frame", row.names = c(NA,
-1L))
Here is one way of getting the names of the files that won't convert.
library(dplyr)
#Include all the columns that we want to check for being numeric
selectCols = c("origin","destination","mask.wearing","prevalence","Dose.air","Dose.close","Dose.fomite")
#Read them into one dataframe
df <- bind_rows(sapply(list_of_files, data.table::fread,
select = selectCols, simplify = FALSE), .id = "FileName")
Once we have the data in one dataframe, we flag every value that fails to convert to numeric, keep the rows where any value fails, and then select only the unique filenames from them.
df %>%
  mutate(across(-FileName, ~ is.na(suppressWarnings(as.numeric(.))))) %>%
  filter(Reduce(`|`, across(-FileName))) %>%
  distinct(FileName)
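If the goal is simply to get the import through despite the stray non-numeric rows, one possible workaround (a sketch, assuming selectCols still holds the column names) is to read every selected column as character and convert types afterwards; fread's colClasses should accept a single class applied to all columns:
library(data.table)
# read everything as character so mixed files don't break the bind
df_chr <- rbindlist(
  sapply(list_of_files, fread, select = selectCols,
         colClasses = "character", simplify = FALSE),
  use.names = TRUE, fill = TRUE, idcol = "FileName"
)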

How do I change the names of columns in multiple dataframes using a mapping file in R?

I have a script that loops through multiple years of data, one year at a time. Each year of data consists of multiple dataframes that are placed in a list called all_input. At the beginning of the loop (after the data is read in), I am trying to get all of the years of data in the same format before the rest of the processing.
The issue I am having is with column names, which are not uniform.
There are 5 columns included in each dataframe that I want to keep, and I want them to be called total_emissions, uom, tribal_name, st_usps_cd, and description. In some dataframes they already have these names, while in others they have various names such as pollutant.desc or pollutant_desc, for example.
My current approach is this:
# Create a mapping file for the column names
header_map <- data.frame(
  original_col = c("pollutant_desc", "pollutant.desc", "emissions.uom", "total.emissions", "tribal.name", "state"),
  new_col = c("description", "description", "uom", "total_emissions", "tribal_name", "st_usps_cd"),
  stringsAsFactors = FALSE)
# change the column names
lapply(all_input, function(x) {
names(x)[match(header_map$original_col, names(x))] <- header_map$new_col
x
}) -> all_input
Which creates a header mapping file that looks like this:
original_col new_col
pollutant_desc description
pollutant.desc description
emissions.uom uom
total.emissions total_emissions
tribal.name tribal_name
state st_usps_cd
The error I am getting is as follows:
Error in names(x)[match(header_map$orignal_col, names(x))] <- header_map$new_col :
NAs are not allowed in subscripted assignments
I understand that I will have to manually add entries to the header file as new years of data with different column names are processed, but how can I get this to work?
Fake sample data. df1 and df2 represent the format of the "2017" data, where multiple columns need name changes, but the current names are consistent between dataframes. df3 represents the "2011" data, where all of the column names are as they should be. df4 represents the "2014" data, where the only column that needs to be changed is pollutant_desc. Note, there are extra columns in each dataframe that are not needed and can be ignored. And as a reminder, these dataframes are not all read at the same time. The loop is by year, so df1 and df2 (in list all_input) will be formatted and processed. Then all of the data is removed, and a new all_input list is created with the next year's dataframes, which will have different column names. The code must work for all years without being changed.
> dput(df1)
structure(list(total.emissions = structure(1:2, .Label = c("100",
"300"), class = "factor"), emissions.uom = structure(1:2, .Label = c("LB",
"TON"), class = "factor"), international = c(TRUE, TRUE), hours = structure(2:1, .Label = c("17",
"3"), class = "factor"), tribal.name = structure(2:1, .Label = c("FLLK",
"SUWJG"), class = "factor"), state = structure(1:2, .Label = c("AK",
"MN"), class = "factor"), pollutant.desc = structure(1:2, .Label = c("Methane",
"NO2"), class = "factor"), policy = c(TRUE, FALSE)), class = "data.frame", row.names = c(NA,
-2L))
> dput(df2)
structure(list(total.emissions = structure(2:1, .Label = c("20",
"400"), class = "factor"), emissions.uom = structure(c(1L, 1L
), .Label = "TON", class = "factor"), international = c(FALSE,
TRUE), hours = structure(2:1, .Label = c("1", "8"), class = "factor"),
tribal.name = structure(2:1, .Label = c("SOSD", "WMFJU"), class = "factor"),
state = structure(2:1, .Label = c("SD", "WY"), class = "factor"),
pollutant.desc = structure(1:2, .Label = c("CO2", "SO2"), class = "factor"),
policy = c(FALSE, FALSE)), class = "data.frame", row.names = c(NA,
-2L))
> dput(df3)
structure(list(total_emissions = structure(2:1, .Label = c("200",
"30"), class = "factor"), uom = structure(c(1L, 1L), .Label = "TON", class = "factor"),
boundaries = structure(2:1, .Label = c("N", "Y"), class = "factor"),
tribal_name = structure(2:1, .Label = c("SOSD", "WMFJU"), class = "factor"),
st_usps_cd = structure(2:1, .Label = c("ID", "KS"), class = "factor"),
description = structure(c(1L, 1L), .Label = "SO2", class = "factor"),
policy = c(FALSE, TRUE), time = structure(1:2, .Label = c("17",
"7"), class = "factor")), class = "data.frame", row.names = c(NA,
-2L))
> dput(df4)
structure(list(total_emissions = structure(2:1, .Label = c("700",
"75"), class = "factor"), uom = structure(c(1L, 1L), .Label = "LB", class = "factor"),
tribal_name = structure(1:2, .Label = c("SSJY", "WNCOPS"), class = "factor"),
st_usps_cd = structure(1:2, .Label = c("MO", "NY"), class = "factor"),
pollutant_desc = structure(2:1, .Label = c("CO2", "Methane"
), class = "factor"), boundaries = structure(c(1L, 1L), .Label = "N", class = "factor"),
policy = c(FALSE, FALSE), time = structure(1:2, .Label = c("2",
"3"), class = "factor")), class = "data.frame", row.names = c(NA,
-2L))
Thank you!
Try this:
list_of_frames1 <- list(df1, df2, df3, df4)
list_of_frames2 <- lapply(list_of_frames1, function(x) {
  # rename only the columns that actually appear in this frame
  nms <- intersect(names(x), header_map$original_col)
  names(x)[match(nms, names(x))] <- header_map$new_col[match(nms, header_map$original_col)]
  x
})
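If dplyr is an option, a shorter variant is possible (a sketch, assuming a reasonably recent dplyr where rename() accepts any_of() with a named lookup vector of the form new_name = old_name):
library(dplyr)
# build the lookup from the mapping file: new name = old name
lookup <- setNames(header_map$original_col, header_map$new_col)
all_input <- lapply(all_input, function(x) rename(x, any_of(lookup)))
any_of() silently skips lookup entries that a given dataframe doesn't contain, so the duplicate "description" mappings are harmless as long as a frame never has both pollutant_desc and pollutant.desc.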

Extract common values for more than one column between several R dataframes

Imagine I have these 4 data frames:
abc_df
abc_ID . abc_classification
a . neutral
b . deletereous
c . benign
def_df
def_ID . def_classification
f . neutral
a . neutral
c . benign
ghi_df
ghi_ID . ghi_classification
f . deletereous
c . benign
k . neutral
vmk_df
vmk_ID . vmk_classification
c . benign
k . deletereous
a . neutral
As you can see, the columns "dfname_ID" and "dfname_classification" are not contiguous (the dots represent other columns in the data frame) and do not have the same column names. So, I would like to extract the rows common to all data frames for these 2 columns, using the index of the columns rather than their names.
The output should be this:
ID . classification
c . benign
I am trying to use intersect with lapply(mget(c('abc_df', 'def_df', 'ghi_df', 'vmk_df'))), but I don't know how to specify the correct command. Do you know how I can solve this?
One option is purrr; the conversion to character may not even be necessary, since intersect coerces the factors anyway:
library(dplyr)    # for mutate_if() and the data frame method of intersect()
library(purrr)
library(magrittr)
COLUMNS = c(1, 2, 3)
list(abc_df, def_df, ghi_df, vmk_df) %>%
  map(~mutate_if(.x[, COLUMNS], is.factor, as.character)) %>%
  map(~set_colnames(.x, c("id", ".", "classification"))) %>%
  reduce(intersect)
id . classification
1 c . benign
Your data:
abc_df = structure(list(abc_ID = structure(1:3, .Label = c("a", "b", "c"
), class = "factor"), . = structure(c(1L, 1L, 1L), .Label = ".", class = "factor"),
abc_classification = structure(3:1, .Label = c("benign",
"deletereous", "neutral"), class = "factor")), class = "data.frame", row.names = c(NA, -3L))
def_df = structure(list(def_ID = structure(c(3L, 1L, 2L), .Label = c("a",
"c", "f"), class = "factor"), . = structure(c(1L, 1L, 1L), .Label = ".", class = "factor"),
def_classification = structure(c(2L, 2L, 1L), .Label = c("benign",
"neutral"), class = "factor")), class = "data.frame", row.names = c(NA, -3L))
ghi_df = structure(list(ghi_ID = structure(c(2L, 1L, 3L), .Label = c("c",
"f", "k"), class = "factor"), . = structure(c(1L, 1L, 1L), .Label = ".", class = "factor"),
ghi_classification = structure(c(2L, 1L, 3L), .Label = c("benign",
"deletereous", "neutral"), class = "factor")), class = "data.frame", row.names = c(NA, -3L))
vmk_df = structure(list(vmk_ID = structure(c(2L, 3L, 1L), .Label = c("a",
"c", "k"), class = "factor"), . = structure(c(1L, 1L, 1L), .Label = ".", class = "factor"),
vmk_classification = structure(1:3, .Label = c("benign",
"deletereous", "neutral"), class = "factor")), class = "data.frame", row.names = c(NA, -3L))
For the data you provided you could use:
library(dplyr)
abc_df %>%
rename(ID = abc_ID, classification = abc_classification) %>%
inner_join(def_df, by = c("ID" = "def_ID",
"classification" = "def_classification")) %>%
inner_join(ghi_df, by = c("ID" = "ghi_ID",
"classification" = "ghi_classification")) %>%
inner_join(vmk_df, by = c("ID" = "vmk_ID",
"classification" = "vmk_classification"))

Creating a nested list object

I have a dataframe as shown below (1st row: column names, 2nd row: data elements).
From
Col_Name Col_Child_1 Col_Grand_Child_1 Col_Great_Grand_Child_1 Col_Great_Grand_Child_Size1 Col_Great_Grand_Child_2 Col_Great_Grand_Child_Size2 Col_Great_Grand_Child_3 Col_Great_Grand_Child_Size3 Col_Great_Grand_Child_4 Col_Great_Grand_Child_Size4
Flare analytics cluster AgglomerativeCluster 3938 CommunityStructure 3812 HierarchicalCluster 6714 MergeEdge 743
I am trying to convert the data elements in the second row ( Flare, analytics....) into a nested list as shown below
> Flare
$name
[1] "flare"
$children
$children[[1]]
$children[[1]]$name
[1] "analytics"
$children[[1]]$children
$children[[1]]$children[[1]]
$children[[1]]$children[[1]]$name
[1] "cluster"
$children[[1]]$children[[1]]$children
$children[[1]]$children[[1]]$children[[1]]
$children[[1]]$children[[1]]$children[[1]]$name
[1] "AgglomerativeCluster"
$children[[1]]$children[[1]]$children[[1]]$size
[1] 3938
$children[[1]]$children[[1]]$children[[2]]
$children[[1]]$children[[1]]$children[[2]]$name
[1] "CommunityStructure"
$children[[1]]$children[[1]]$children[[2]]$size
[1] 3812
$children[[1]]$children[[1]]$children[[3]]
$children[[1]]$children[[1]]$children[[3]]$name
[1] "HierarchicalCluster"
$children[[1]]$children[[1]]$children[[3]]$size
[1] 6714
$children[[1]]$children[[1]]$children[[4]]
$children[[1]]$children[[1]]$children[[4]]$name
[1] "MergeEdge"
$children[[1]]$children[[1]]$children[[4]]$size
[1] 743
My attempts so far are very elementary and reflect my struggle :).
x = list(name = test1$Col_Name, children = c(test1$Col_Child_1))
Any help on solving this puzzle is much appreciated. Thanks in advance.
Below is the output from dput(test1)
test1 = structure(list(Col_Name = structure(2L, .Names = "row1", .Label = c("Col_Name",
"Flare"), class = "factor"), Col_Child_1 = structure(1L, .Names = "row1", .Label = c("analytics",
"Col_Child_1"), class = "factor"), Col_Grand_Child_1 = structure(1L, .Names = "row1", .Label = c("cluster",
"Col_Grand_Child_1"), class = "factor"), Col_Great_Grand_Child_1 = structure(1L, .Names = "row1", .Label = c("AgglomerativeCluster",
"Col_Great_Grand_Child_1"), class = "factor"), Col_Great_Grand_Child_Size1 = structure(1L, .Names = "row1", .Label = c("3938",
"Col_Great_Grand_Child_Size1"), class = "factor"), Col_Great_Grand_Child_2 = structure(2L, .Names = "row1", .Label = c("Col_Great_Grand_Child_2",
"CommunityStructure"), class = "factor"), Col_Great_Grand_Child_Size2 = structure(1L, .Names = "row1", .Label = c("3812",
"Col_Great_Grand_Child_Size2"), class = "factor"), Col_Great_Grand_Child_3 = structure(2L, .Names = "row1", .Label = c("Col_Great_Grand_Child_3",
"HierarchicalCluster"), class = "factor"), Col_Great_Grand_Child_Size3 = structure(1L, .Names = "row1", .Label = c("6714",
"Col_Great_Grand_Child_Size3"), class = "factor"), Col_Great_Grand_Child_4 = structure(2L, .Names = "row1", .Label = c("Col_Great_Grand_Child_4",
"MergeEdge"), class = "factor"), Col_Great_Grand_Child_Size4 = structure(1L, .Names = "row1", .Label = c("743",
"Col_Great_Grand_Child_Size4"), class = "factor")), .Names = c("Col_Name",
"Col_Child_1", "Col_Grand_Child_1", "Col_Great_Grand_Child_1",
"Col_Great_Grand_Child_Size1", "Col_Great_Grand_Child_2", "Col_Great_Grand_Child_Size2",
"Col_Great_Grand_Child_3", "Col_Great_Grand_Child_Size3", "Col_Great_Grand_Child_4",
"Col_Great_Grand_Child_Size4"), row.names = 2L, class = "data.frame")
The code below does not generalise well, so watch out and good luck with it :)
## the question's data frame
dat <- test1
## get rid of factors
dat <- data.frame(lapply(dat, as.character), stringsAsFactors = FALSE)
## identify descendants -- hoping that the great grand children do not reproduce further
Children.names <- grep('Col_Child_[[:digit:]]', colnames(dat))
Grand_Children.names <- grep('Col_Grand_Child_[[:digit:]]', colnames(dat))
Great_Grand_Children.names <- grep('Col_Great_Grand_Child_[[:digit:]]', colnames(dat))
Great_Grand_Children.sizes <- grep('Col_Great_Grand_Child_Size[[:digit:]]', colnames(dat))
## putting it together into a list of lists (of lists)
nggc <- length(Great_Grand_Children.sizes)
ggc <- lapply(1:nggc, function(i) list(name = dat[1, Great_Grand_Children.names[i]],
                                       size = as.numeric(dat[1, Great_Grand_Children.sizes[i]])))
gc <- list(name = dat[1, Grand_Children.names[1]], children = ggc)
## fingers crossed now...
ll <- list(name = dat$Col_Name)
ll$children <- list( list(name = dat[1, Children.names[1]], children = list(gc)) )
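If the eventual goal is the d3-style "flare" JSON that this kind of nested list usually feeds (an assumption about the use case, not something stated in the question), a quick sketch with jsonlite:
library(jsonlite)
# auto_unbox keeps length-1 elements as scalars rather than one-element arrays
toJSON(ll, auto_unbox = TRUE, pretty = TRUE)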

Collapse and aggregate several row values by date

I've got a data set that looks like this:
date, location, value, tally, score
2016-06-30T09:30Z, home, foo, 1,
2016-06-30T12:30Z, work, foo, 2,
2016-06-30T19:30Z, home, bar, , 5
I need to aggregate these rows together, to obtain a result such as:
date, location, value, tally, score
2016-06-30, [home, work], [foo, bar], 3, 5
There are several challenges for me:
The resulting row (a daily aggregate) must include the rows for this day (2016-06-30 in my example above)
Some columns (strings) will result in an array containing all the values present on this day
Some others (ints) will result in a sum
I've had a look at dplyr, and if possible I'd like to do this in R.
Thanks for your help!
Edit:
Here's a dput of the data
structure(list(date = structure(1:3, .Label = c("2016-06-30T09:30Z",
"2016-06-30T12:30Z", "2016-06-30T19:30Z"), class = "factor"),
location = structure(c(1L, 2L, 1L), .Label = c("home", "work"
), class = "factor"), value = structure(c(2L, 2L, 1L), .Label = c("bar",
"foo"), class = "factor"), tally = c(1L, 2L, NA), score = c(NA,
NA, 5L)), .Names = c("date", "location", "value", "tally",
"score"), class = "data.frame", row.names = c(NA, -3L))
mydat<-structure(list(date = structure(1:3, .Label = c("2016-06-30T09:30Z",
"2016-06-30T12:30Z", "2016-06-30T19:30Z"), class = "factor"),
location = structure(c(1L, 2L, 1L), .Label = c("home", "work"
), class = "factor"), value = structure(c(2L, 2L, 1L), .Label = c("bar",
"foo"), class = "factor"), tally = c(1L, 2L, NA), score = c(NA,
NA, 5L)), .Names = c("date", "location", "value", "tally",
"score"), class = "data.frame", row.names = c(NA, -3L))
mydat$date <- as.Date(mydat$date)
require(data.table)
mydat.dt <- data.table(mydat)
mydat.dt <- mydat.dt[, lapply(.SD, paste0, collapse = " "), by = date, .SDcols = c("location", "value")]
cbind(mydat.dt, aggregate(mydat[,c("tally", "score")], by=list(mydat$date), FUN = sum, na.rm=T)[2:3])
which gives you:
date location value tally score
1: 2016-06-30 home work home foo foo bar 3 5
Note that if you wanted to, you could probably do it all in one step when reshaping the data.table, but I found this to be a quicker and easier way to achieve the same thing in two steps.
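For reference, here is a sketch of that one-step version (assuming the same mydat, with date already converted by as.Date() as above):
library(data.table)
as.data.table(mydat)[, .(
  location = paste(location, collapse = " "),
  value    = paste(value, collapse = " "),
  tally    = sum(tally, na.rm = TRUE),
  score    = sum(score, na.rm = TRUE)
), by = date]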
