Count rows in R data.table [duplicate] - r

This question already has answers here:
Count number of rows per group and add result to original data frame
(11 answers)
Closed 7 years ago.
For a sample dataframe:
library(data.table)
df = structure(list(country = c("AT", "AT", "AT", "BE", "BE", "BE",
"DE", "DE", "DE", "DE", "DE", "DE", "DE", "DE", "DE", "DE", "DE",
"DE", "DE", "DE"), level = c("1", "1", "1", "1", "1", "1", "1",
"1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1"
), region = c("AT2", "AT1", "AT3", "BE2", "BE1", "BE3", "DE4",
"DE3", "DE9", "DE7", "DE1", "DEE", "DEG", "DE2", "DED", "DEB",
"DEA", "DEF", "DE6", "DE8"), N = c("348", "707", "648", "952",
"143", "584", "171", "155", "234", "176", "302", "144", "148",
"386", "257", "126", "463", "74", "44", "119"), result = c("24.43",
"26.59", "20.37", "23.53", "16.78", "25.51", "46.2", "43.23",
"41.03", "37.5", "33.44", "58.33", "47.97", "34.46", "39.69",
"31.75", "36.93", "43.24", "36.36", "43.7")), .Names = c("country",
"level", "region", "N", "result"), class = c("data.table", "data.frame"
), row.names = c(NA, -20L))
I am using the following code to produce a summary table:
# Summarise by country: min/max of 'result' plus the first 'level' per group.
# NOTE(review): this operates on 'variable.regions', but the example object
# defined above is named 'df' — confirm which object is intended.
# NOTE(review): 'result' is stored as character in the example, so min()/max()
# compare lexicographically, not numerically — wrap in as.numeric() if numeric
# extremes are wanted.
variable.country <-setDT(variable.regions)[order(country), list(min_result = min(result),
max_result = max(result), level= level[1L]), by = country]
I simply want to add another variable to this data table which allows me to know how many regions i.e. rows there are in each country (i.e. AT has 3) - how would I get length or dim to work under these circumstances?
Thanks.

We can use .N to get the length per each 'country'
# .N is data.table's special symbol holding the number of rows in the current
# group, so 'len' here is the per-country row (i.e. region) count.
setDT(variable.regions)[order(country),
list(min_result = min(result),
len = .N,
max_result = max(result),
level= level[1L]),
by = country]

Related

Replace value with NULL in column [duplicate]

This question already has an answer here:
Set NA and "" Cells in R Dataframe to NULL
(1 answer)
Closed 4 years ago.
I have a dataframe where I want to replace all values in a column that contain the value '2018' with NULL.
I have a dataset where every value in a column is a list. There are NULLs included as well. One of the values is not a list and I want to replace it with a NULL. If I replace it with NA then the datatypes in that column are mixed.
If I have a column like below, how do I replace the value containing 2018 with NULL instead of NA?
spend actions
176.2 2018-02-24
166.66 list(action_type = c("landing_page_view", "link_click", "offsit...
153.89 list(action_type = c("landing_page_view", "like", "link_click",...
156.54 list(action_type = c("landing_page_view", "like", "link_click",...
254.95 list(action_type = c("landing_page_view", "like", "link_click",...
374 list(action_type = c("landing_page_view", "like", "link_click",...
353.29 list(action_type = c("landing_page_view", "like", "link_click",...
0.41 NULL
Reproducible Example:
structure(list(spend = c("176.2", "166.66", "153.89", "156.54",
"254.95", "374", "353.29", "0.41"), actions = list("2018-02-24",
structure(list(action_type = c("landing_page_view", "link_click",
"offsite_conversion.fb_pixel_add_to_cart",
"offsite_conversion.fb_pixel_purchase",
"offsite_conversion.fb_pixel_search",
"offsite_conversion.fb_pixel_view_content",
"post", "post_reaction", "page_engagement", "post_engagement",
"offsite_conversion"), value = c("179", "275", "212", "18",
"269", "1434", "1", "17", "293", "293", "1933")), .Names = c("action_type",
"value"), class = "data.frame", row.names = c(NA, 11L)),
structure(list(action_type = c("landing_page_view", "like",
"link_click", "offsite_conversion.fb_pixel_add_to_cart",
"offsite_conversion.fb_pixel_purchase",
"offsite_conversion.fb_pixel_search",
"offsite_conversion.fb_pixel_view_content", "post_reaction",
"page_engagement", "post_engagement", "offsite_conversion"
), value = c("136", "3", "248", "101", "6", "237", "730",
"11", "262", "259", "1074")), .Names = c("action_type", "value"
), class = "data.frame", row.names = c(NA, 11L)), structure(list(
action_type = c("landing_page_view", "like", "link_click",
"offsite_conversion.fb_pixel_add_to_cart",
"offsite_conversion.fb_pixel_purchase",
"offsite_conversion.fb_pixel_search",
"offsite_conversion.fb_pixel_view_content",
"post", "post_reaction", "page_engagement", "post_engagement",
"offsite_conversion"), value = c("95", "1", "156", "91",
"5", "83", "532", "1", "13", "171", "170", "711")), .Names =
c("action_type",
"value"), class = "data.frame", row.names = c(NA, 12L)),
structure(list(action_type = c("landing_page_view", "like",
"link_click", "offsite_conversion.fb_pixel_add_to_cart",
"offsite_conversion.fb_pixel_purchase",
"offsite_conversion.fb_pixel_search",
"offsite_conversion.fb_pixel_view_content", "post_reaction",
"page_engagement", "post_engagement", "offsite_conversion"
), value = c("178", "4", "243", "56", "4", "138", "437",
"19", "266", "262", "635")), .Names = c("action_type", "value"
), class = "data.frame", row.names = c(NA, 11L)), structure(list(
action_type = c("landing_page_view", "like", "link_click",
"offsite_conversion.fb_pixel_add_to_cart",
"offsite_conversion.fb_pixel_purchase",
"offsite_conversion.fb_pixel_search",
"offsite_conversion.fb_pixel_view_content",
"post_reaction", "page_engagement", "post_engagement",
"offsite_conversion"), value = c("203", "2", "306", "105",
"7", "186", "954", "23", "331", "329", "1252")), .Names =
c("action_type",
"value"), class = "data.frame", row.names = c(NA, 11L)),
structure(list(action_type = c("landing_page_view", "like",
"link_click", "offsite_conversion.fb_pixel_add_to_cart",
"offsite_conversion.fb_pixel_purchase",
"offsite_conversion.fb_pixel_search",
"offsite_conversion.fb_pixel_view_content", "post", "post_reaction",
"page_engagement", "post_engagement", "offsite_conversion"
), value = c("241", "4", "320", "106", "3", "240", "789",
"1", "17", "342", "338", "1138")), .Names = c("action_type",
"value"), class = "data.frame", row.names = c(NA, 12L)),
NULL)), .Names = c("spend", "actions"), row.names = c(NA,
-8L), class = "data.frame")
My ultimate goal is to use this function with this dataset to make the action_types their own column. This function works when either a list or NULL is in the actions column:
# Drop rows whose 'actions' entry is NULL, unnest the list-column into rows,
# re-join the non-list columns, then spread action_type into its own columns.
# NOTE(review): this is the failing code from the question — unnest() needs a
# homogeneous list-column, and here 'actions' mixes data frames with a bare
# character value ("2018-02-24"), producing the reported error.
# NOTE(review): as.tibble() is deprecated in favour of as_tibble().
fb_insights_all<-df %>%
as.tibble() %>%
filter(!map_lgl(actions, is.null)) %>%
unnest() %>%
right_join(select(df, -actions)) %>%
spread(action_type, value)
Error: Each column must either be a list of vectors or a list of data frames [actions]
Without data to test this on, I'd try:
df$COL1<-ifelse(grepl("2018", df$COL1),"NULL",df$COL1)
As stated here NA functions more like what you seem to be trying to do, while NULL serves a different function. If you just want the value to just say "NULL" rather than function like NULL, treat it like a character value.

as.numeric creates hanging decimal

Curious what's causing this behavior. Reading in an Excel file with numbers stored as text. When I convert to numeric, integers get a decimal point added (i.e. 153 becomes 153.) — no 0, nothing, just a hanging decimal. Any idea how to correct this?
structure(list(`Nest ID` = c("21Lk", "21Lk", "21Lk", "21Lk",
"A-Frye"), `Clutch size` = c(NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_), `hatch size` = c("4", "4", "4",
"4", "7"), `hatch date` = c("146", "146", "146", "146", "153"
), Date = c("149", "167", "188", "247", "161"), Time = c("900",
"900", "1200", "1224", "1538"), Cygnets = c("4", "3", NA, NA,
"7"), detection = c("1", "1", "0", "0", "1"), `Age when found (days)` =
c("3",
"3", "3", "3", "8"), `Age at time of observation (days)` = c("3",
"21", "42", "101", "8"), `prob of complete brood loss` = c("0.5",
"0.5", "0.5", "0.5", "0.1"), `Predator fish present (1 likely, 2 unknown, 0
unlikely)` = c("1",
"1", "1", "1", "1"), `Wetland Area (NWI) (ha)` = c("109.4938966",
"109.4938966", "109.4938966", "109.4938966", "34.923899399999996"
), Notes = c(NA, NA, "unsure, could have been in the deep veg",
"unsure, could have been in the deep veg", NA)), .Names = c("Nest ID",
"Clutch size", "hatch size", "hatch date", "Date", "Time", "Cygnets",
"detection", "Age when found (days)", "Age at time of observation (days)",
"prob of complete brood loss", "Predator fish present (1 likely, 2 unknown,
0 unlikely)",
"Wetland Area (NWI) (ha)", "Notes"), row.names = c(NA, -5L), class =
c("tbl_df",
"tbl", "data.frame"))
and the code used to convert/cleanup
library(tidyverse)
# Rename the long Excel-style headers to short names, then coerce the
# character columns to numeric.
# NOTE(review): funs() is deprecated (dplyr >= 0.8) — prefer
# list(~ as.numeric(.)) or mutate(across(..., as.numeric)).
# NOTE(review): the "hanging decimal" (153.) is only how some consoles print
# the value; the underlying numeric is unaffected.
test.2<-test%>%
rename(NestID=`Nest ID`,
CS=`Clutch size`,
HS=`hatch size`,
HD=`hatch date`,
DOY=Date,
Age.firstobs=`Age when found (days)`,
Age.current=`Age at time of observation (days)`,
is.broodloss=`prob of complete brood loss`,
pred.fish=`Predator fish present (1 likely, 2 unknown, 0 unlikely)`,
WA=`Wetland Area (NWI) (ha)`)%>%
mutate_at(vars(CS:DOY,Cygnets,Age.firstobs,Age.current,pred.fish),funs(as.numeric))%>%
mutate_at(vars(detection,is.broodloss,WA),funs(as.numeric))

Drawing slope graph in R using ggplot, Error: Aesthetics must be either length 1 or the same as the data

I want to create a slope graph in R like this using ggplot
https://rud.is/b/2013/01/11/slopegraphs-in-r/
After cleaning the data and melting the data frame, I ran into an error like this:
Error: Aesthetics must be either length 1 or the same as the data (182): x, y, group, colour, label
There's no NAs in my data. Any ideas? Much appreciated!
Here's the code
#Read file as numeric data
# Read file as numeric data
betterlife <- read.csv("betterlife.csv", skip = 4, stringsAsFactors = FALSE)
num_data <- data.frame(data.matrix(betterlife))
# Keep columns that are mostly numeric (< 50% NA after coercion)
numeric_columns <- sapply(num_data, function(x) mean(as.numeric(is.na(x))) < 0.5)
final_data <- data.frame(num_data[, numeric_columns],
                         betterlife[, !numeric_columns])
## Rescale selected columns of the data frame to a common 0-100 scale
final_data <- data.frame(lapply(final_data[, c(3, 4, 5, 6, 7, 10, 11)],
                                function(x) scale(x, center = FALSE,
                                                  scale = max(x, na.rm = TRUE) / 100)))
## Add country names as indicator
final_data$INDICATOR <- betterlife$INDICATOR
employment.data <- final_data[5:30, ]
indicator <- employment.data$INDICATOR
## Melt to long format, keeping INDICATOR as the id column so each melted row
## carries its own country label.
## Fix: the original melted without id.vars and then passed the 26-element
## 'indicator' vector into aes(); its length no longer matched the melted data
## (182 rows), which is exactly the reported error
## "Aesthetics must be either length 1 or the same as the data (182)".
employment.melt <- melt(employment.data, id.vars = "INDICATOR")
# Plot: map group/colour/label to the INDICATOR column inside the melted data
# instead of an external vector.
sg <- ggplot(employment.melt, aes(factor(variable), value,
                                  group = INDICATOR,
                                  colour = INDICATOR,
                                  label = INDICATOR)) +
  theme(legend.position = "none",
        axis.text.x = element_text(size = 5),
        axis.text.y = element_blank(),
        axis.title.x = element_blank(),
        axis.title.y = element_blank(),
        axis.ticks = element_blank(),
        axis.line = element_blank(),
        panel.grid.major.x = element_line("black", size = 0.1),
        panel.grid.major.y = element_blank(),
        panel.grid.minor.y = element_blank(),
        panel.background = element_blank())
# Fix: the original printed 'sg1', which was never defined; the object is 'sg'.
sg
This is the data I'm working with
dput(betterlife)
structure(list(X = c("", "ISO3", "AUS", "AUT", "BEL", "CAN",
"CHL", "CZE", "DNK", "EST", "FIN", "FRA", "DEU", "GRC", "HUN",
"ISL", "IRL", "ISR", "ITA", "JPN", "KOR", "LUX", "MEX", "NLD",
"NZL", "NOR", "POL", "PRT", "SVK", "SVN", "ESP", "SWE", "CHE",
"TUR", "GBR", "USA", "OECD", "", ""),
INDICATOR = c("UNIT", "COUNTRY",
"Australia", "Austria", "Belgium", "Canada", "Chile", "Czech Republic",
"Denmark", "Estonia", "Finland", "France", "Germany", "Greece",
"Hungary", "Iceland", "Ireland", "Israel", "Italy", "Japan",
"Korea", "Luxembourg", "Mexico", "Netherlands", "New Zealand",
"Norway", "Poland", "Portugal", "Slovak Republic", "Slovenia",
"Spain", "Sweden", "Switzerland", "Turkey", "United Kingdom",
"United States", "OECD average", "", "n.a. : not available"),
Rooms.per.person = c("Average number of rooms shared per person in a dwelling",
"", "2.4", "1.7", "2.3", "2.5", "1.3", "1.3", "1.9", "1.2",
"1.9", "1.8", "1.7", "1.2", "1", "1.6", "2.1", "1.1", "1.4",
"1.8", "1.3", "1.9", "1.566666667", "2", "2.3", "1.9", "1",
"1.5", "1.1", "1.1", "1.9", "1.8", "1.7", "0.7", "1.8", "1.605208333",
"1.6", "", ""),
Dwelling.without.basic.facilities = c("% of people without indoor flushing toilets in their home",
"", "3.425714286", "1.3", "0.6", "2.722", "9.36", "0.7",
"0", "12.2", "0.8", "0.8", "1.2", "1.8", "7.1", "0.3", "0.3",
"2.52", "0.2", "6.4", "7.46", "0.8", "6.6", "0", "2.984285714",
"0.1", "4.8", "2.4", "1.1", "0.6", "0", "0", "0.1", "17.1",
"0.5", "0", "2.82", "", ""),
Household.disposable.income = c("USD (PPPs adjusted)",
"", "27,039", "27,670", "26,008", "27,015", "8,712", "16,690",
"22,929", "13,486", "24,246", "27,508", "27,665", "21,499",
"13,858", "19,621", "24,313", "22,539", "24,383", "23,210",
"16,254", "19,621", "12,182", "25,977", "18,819", "29,366",
"13,811", "18,540", "15,490", "19,890", "22,972", "26,543",
"27,542", "21,030", "27,208", "37,685", "22,284", "", ""),
Employment.rate = c("% of the working age population (15-64)",
"", "72.3", "71.73", "62.01", "71.68", "59.32", "65", "73.44",
"61.02", "68.15", "63.99", "71.1", "59.55", "55.4", "78.17",
"59.96", "59.21", "56.89", "70.11", "63.31", "65.21", "60.39",
"74.67", "72.34", "75.31", "59.26", "65.55", "58.76", "66.2",
"58.55", "72.73", "78.59", "46.29", "69.51", "66.71", "64.52",
"", ""),
Long.term.unemployment.rate = c("% of people, aged 15-64, who are not working but have been actively seeking a job for over a year",
"", "1", "1.13", "4.07", "0.97", "2.98375", "3.19", "1.44",
"7.84", "2.01", "3.75", "3.4", "5.73", "5.68", "1.35", "6.74",
"1.85", "4.13", "1.99", "0.01", "1.29", "0.13", "1.24", "0.6",
"0.34", "2.49", "5.97", "8.56", "3.21", "9.1", "1.42", "1.49",
"3.11", "2.59", "2.85", "2.74", "", ""),
Quality.of.support.network = c("% of people who have friends or relatives to rely on in case of need",
"", "95.4", "94.6", "92.6", "95.3", "85.2", "88.9", "96.8",
"84.6", "93.4", "93.9", "93.5", "86.1", "88.6", "97.6", "97.3",
"93", "86", "89.7", "79.8", "95", "87.1", "94.8", "97.1",
"93.1", "92.2", "83.3", "89.6", "90.7", "94.1", "96.2", "93.2",
"78.8", "94.9", "92.3", "91.1", "", ""),
Educational.attainment = c("% of people, aged 15-64, having at least an upper-secondary (high-school) degree",
"", "69.72", "81.04", "69.58", "87.07", "67.97", "90.9",
"74.56", "88.48", "81.07", "69.96", "85.33", "61.07", "79.7",
"64.13", "69.45", "81.23", "53.31", "87", "79.14", "67.94",
"33.55", "73.29", "72.05", "80.7", "87.15", "28.25", "89.93",
"82.04", "51.23", "85.04", "86.81", "30.31", "69.63", "88.7",
"72.95", "", ""),
Students.reading.skills = c("Average reading performance of students aged 15, according to PISA",
"", "515", "470", "506", "524", "449", "478", "495", "501",
"536", "496", "497", "483", "494", "500", "496", "474", "486",
"520", "539", "472", "425", "508", "521", "503", "500", "489",
"477", "483", "481", "497", "501", "464", "494", "500", "493",
"", ""),
Air.pollution = c("Average concentration of particulate matter (PM10) in cities with population larger than 100 000, measured in micrograms per cubic meter",
"", "14.28", "29.03", "21.27", "15", "61.55", "18.5", "16.26",
"12.62", "14.87", "12.94", "16.21", "32", "15.6", "14.47",
"12.54", "27.57", "23.33", "27.14", "30.76", "12.63", "32.69",
"30.76", "11.93", "15.85", "35.07", "21", "13.14", "29.03",
"27.56", "10.52", "22.36", "37.06", "12.67", "19.4", "21.99",
"", ""),
Consultation.on.rule.making = c("Composite index, increasing with the number of key elements of formal consultation processes",
"", "10.5", "7.13", "4.5", "10.5", "2", "6.75", "7", "3.25",
"9", "3.5", "4.5", "6.5", "7.88", "5.13", "9", "2.5", "5",
"7.25", "10.38", "6", "9", "6.13", "10.25", "8.13", "10.75",
"6.5", "6.63", "10.25", "7.25", "10.88", "8.38", "5.5", "11.5",
"8.25", "7.28", "", ""),
Voter.turnout = c("Number of people voting as % of the registered population ",
"", "95", "82", "91", "60", "88", "64", "87", "62", "74",
"84", "78", "74", "64", "84", "67", "65", "81", "67", "63",
"57", "59", "80", "79", "77", "54", "64", "55", "63", "75",
"82", "48", "84", "61", "90", "72", "", ""),
Life.expectancy = c("Average number of years a person can expect to live",
"", "81.5", "80.5", "79.8", "80.7", "77.8", "77.3", "78.8",
"73.9", "79.9", "81", "80.2", "80", "73.8", "81.3", "79.9",
"81.1", "81.5", "82.7", "79.9", "80.6", "75.1", "80.2", "80.4",
"80.6", "75.6", "79.3", "74.8", "78.8", "81.2", "81.2", "82.2",
"73.6", "79.7", "77.9", "79.2", "", ""),
Self.reported.health = c("% of people reporting their health to be \"good or very good\"",
"", "84.9", "69.6", "76.7", "88.1", "56.2", "68.2", "74.3",
"56.3", "67.7", "72.4", "64.7", "76.4", "55.2", "80.6", "84.4",
"79.7", "63.4", "32.7", "43.7", "74", "65.5", "80.6", "89.7",
"80", "57.7", "48.6", "31.1", "58.8", "69.8", "79.1", "80.95",
"66.8", "76", "88", "69", "", ""),
Life.Satisfaction = c("Average self-evaluation of life satisfaction, on a scale from 0 to 10",
"", "7.5", "7.3", "6.9", "7.7", "6.6", "6.2", "7.8", "5.1",
"7.4", "6.8", "6.7", "5.8", "4.7", "6.9", "7.3", "7.4", "6.4",
"6.1", "6.1", "7.1", "6.8", "7.5", "7.2", "7.6", "5.8", "4.9",
"6.1", "6.1", "6.2", "7.5", "7.5", "5.5", "7", "7.2", "6.7",
"", ""),
Homicide.rate = c("Average number of reported homicides per 100 000 people",
"", "1.2", "0.5", "1.8", "1.7", "8.1", "2", "1.4", "6.3",
"2.5", "1.4", "0.8", "1.1", "1.5", "0", "2", "2.4", "1.2",
"0.5", "2.3", "1.5", "11.6", "1", "1.3", "0.6", "1.2", "1.2",
"1.7", "0.5", "0.9", "0.9", "0.7", "2.9", "2.6", "5.2", "2.1",
"", ""),
Assault.rate = c("% of people who report having been assaulted in the previous year",
"", "2.1", "3", "7.3", "1.4", "9.5", "3.5", "3.9", "6.2",
"2.4", "4.9", "3.6", "3.8", "3.8", "2.7", "2.7", "3.1", "4.7",
"1.6", "2.1", "4.3", "14.8", "5", "2.3", "3.3", "2.2", "6.2",
"3.5", "3.9", "4.2", "5.2", "4.2", "6", "1.9", "1.6", "4.1",
"", "")),
.Names = c("X", "INDICATOR", "Rooms.per.person", "Dwelling.without.basic.facilities",
"Household.disposable.income", "Employment.rate",
"Long.term.unemployment.rate", "Quality.of.support.network",
"Educational.attainment", "Students.reading.skills", "Air.pollution",
"Consultation.on.rule.making", "Voter.turnout", "Life.expectancy",
"Self.reported.health", "Life.Satisfaction", "Homicide.rate",
"Assault.rate"), class = "data.frame", row.names = c(NA, -39L))
Did I melt the data frame wrongly, since the indices of the rows are not in the correct order?

Rule Learning using SBRL in R

I'm trying to use the Scalable Bayesian Rule Lists Model for creating some rule lists in R.
Link to package: SBRL Package R
I read data into a list, split into train and test and plug into the function
sbrl_model <- sbrl(data_train,iters=20000, pos_sign="1", neg_sign="0",)
which gives me the following error:
Error in asMethod(object) :
column(s) 1, 2, 4, 6 not logical or a factor. Discretize the columns first.
When I convert the data_train into a factor and try using:
data_train <- sapply(data_train, as.factor)
sbrl_model <- sbrl::sbrl(data_train, iters=20000, pos_sign="1", neg_sign="0",)
I get the following error:
Error in data_train$label : $ operator is invalid for atomic vectors
My data has the following columns:
state, amounts, timestamp, code, risk, vendor, label
The label is 0 or 1. I need to create rules for detecting what data leads to a 1.
I'm new to R so this seems confusing. If I don't convert to factors, it complains, if I do it can't use the "$" operator. Any ideas what I'm doing wrong? Thank you
> dput(data_train)
structure(c("PR", "PR", "PR", "PR", "MA", "MA", "NH", "NH", "ME",
"ME", "ME", "VT", "VT", "CT", "CT", "NJ", "NJ", "NY", "NY", "NY",
"NY", "NY", "NY", "NY", "PA", "PA", "PA", "PA", "PA", "PA", "PA",
"PA", "PA", "DE", "VA", "VA", "VA", "WV", "WV", "WV", "WV", "WV",
"WV", "WV", "WV", "WV", "WV", "WV", "WV", "WV", "WV", "WV", "WV",
"WV", "WV", "WV", "GA", "GA", "FL", "FL", "FL", "FL", "FL", "FL",
"AL", "AL", "AL", "TN", "TN", "TN", "MS", "MS", "MS", "KY", "KY",
"KY", "KY", "KY", "KY", "KY", "KY", "KY", "OH", "OH", "OH", "OH",
"OH", "OH", "OH", "OH", "OH", "OH", "OH", "OH", "OH", "OH", "IN",
"IA", "IA", "IA", "IA", "WI", "MN", "MN", "MN", "MN", "MN", "SD",
"SD", "ND", "ND", "ND", "ND", "ND", "MO", "MO", "MO", "MO", "MO",
"MO", "MO", "MO", "MO", "MO", "MO", "MO", "KS", "KS", "KS", "KS",
"KS", "KS", "KS", "16441", "92946", "8970", "19937", "94589",
"50615", "75915", "50005", "23037", "14835", "83678", "66263",
"60818", "82760", "42137", "32888", "35385", "20242", "98269",
"16216", "76562", "49327", "30699", "1866", "91301", "75125",
"34016", "88673", "78612", "85008", "91030", "57276", "96772",
"79568", "59489", "14154", "71655", "78163", "41673", "19942",
"19364", "34004", "79349", "1611", "8875", "19673", "5422", "42395",
"11899", "26967", "73499", "79916", "71015", "73640", "39759",
"7735", "84853", "31662", "43183", "44787", "79001", "82999",
"17031", "88109", "62215", "56040", "66592", "59148", "20786",
"30106", "46561", "9125", "83512", "60031", "65233", "49512",
"8893", "46275", "11362", "29867", "61573", "46363", "91510",
"19267", "45554", "41193", "54267", "8045", "28089", "62450",
"69082", "66685", "80769", "15446", "62589", "42875", "74723",
"2934", "18540", "96540", "60812", "50636", "90924", "60556",
"90009", "15287", "35529", "28702", "82102", "96967", "5296",
"64804", "48743", "10867", "60914", "83678", "77883", "97631",
"97175", "48103", "63128", "46774", "18285", "74512", "69313",
"80414", "32394", "51103", "51155", "28672", "38460", "89024",
"49443", "2016-01-23 12:14:07", "2016-01-17 19:22:37", "2016-01-23 22:41:32",
"2016-01-27 09:58:34", "2016-01-30 08:40:06", "2016-01-28 01:41:40",
"2016-01-27 08:22:27", "2016-01-28 00:13:48", "2016-01-20 12:31:12",
"2016-01-17 08:25:30", "2016-01-28 13:01:36", "2016-01-20 12:10:46",
"2016-01-25 07:32:01", "2016-01-23 02:13:11", "2016-01-24 11:14:46",
"2016-01-16 20:59:35", "2016-01-19 20:12:58", "2016-01-19 06:38:06",
"2016-01-27 10:15:48", "2016-01-26 14:00:30", "2016-01-28 01:54:45",
"2016-01-27 05:43:58", "2016-01-25 22:07:06", "2016-01-18 09:58:05",
"2016-01-20 05:56:54", "2016-01-26 08:05:32", "2016-01-28 14:18:45",
"2016-01-22 06:25:48", "2016-01-27 18:05:50", "2016-01-16 11:33:47",
"2016-01-22 03:31:52", "2016-01-23 05:41:37", "2016-01-27 00:55:22",
"2016-01-16 17:19:51", "2016-01-18 10:05:42", "2016-01-22 10:20:16",
"2016-01-26 21:07:20", "2016-01-17 19:12:00", "2016-01-19 17:59:45",
"2016-01-28 08:50:18", "2016-01-16 09:31:52", "2016-01-24 14:50:13",
"2016-01-17 14:02:36", "2016-01-20 17:08:29", "2016-01-25 16:42:03",
"2016-01-19 04:18:27", "2016-01-20 03:05:13", "2016-01-26 23:34:33",
"2016-01-26 13:44:56", "2016-01-16 07:09:41", "2016-01-26 06:43:12",
"2016-01-26 20:22:25", "2016-01-23 05:58:38", "2016-01-19 23:21:00",
"2016-01-16 08:36:10", "2016-01-30 01:21:00", "2016-01-23 11:10:06",
"2016-01-27 15:29:30", "2016-01-30 15:50:38", "2016-01-19 08:32:33",
"2016-01-19 18:18:02", "2016-01-21 14:20:47", "2016-01-17 13:19:59",
"2016-01-20 05:49:06", "2016-01-16 15:54:17", "2016-01-21 09:15:42",
"2016-01-16 07:32:39", "2016-01-28 03:49:00", "2016-01-26 00:19:56",
"2016-01-25 10:29:44", "2016-01-23 06:26:45", "2016-01-29 08:03:34",
"2016-01-22 14:24:34", "2016-01-16 18:44:43", "2016-01-26 00:00:51",
"2016-01-20 17:38:03", "2016-01-17 22:38:47", "2016-01-30 10:12:01",
"2016-01-21 17:00:43", "2016-01-22 08:43:30", "2016-01-27 12:04:58",
"2016-01-25 21:09:40", "2016-01-27 16:35:42", "2016-01-27 20:09:03",
"2016-01-27 09:52:40", "2016-01-26 16:12:37", "2016-01-28 16:57:29",
"2016-01-30 13:48:47", "2016-01-30 19:15:03", "2016-01-24 19:33:56",
"2016-01-28 06:57:55", "2016-01-22 18:21:40", "2016-01-16 02:54:57",
"2016-01-23 08:18:44", "2016-01-20 13:47:54", "2016-01-24 16:23:39",
"2016-01-24 19:15:09", "2016-01-22 14:59:14", "2016-01-30 10:21:43",
"2016-01-27 11:54:39", "2016-01-30 15:19:59", "2016-01-24 19:21:48",
"2016-01-27 07:20:14", "2016-01-25 07:11:55", "2016-01-24 22:33:42",
"2016-01-26 14:30:57", "2016-01-16 13:12:46", "2016-01-28 11:25:45",
"2016-01-28 14:44:25", "2016-01-23 03:25:10", "2016-01-26 13:45:49",
"2016-01-19 06:14:21", "2016-01-25 22:12:29", "2016-01-25 12:13:07",
"2016-01-22 23:56:39", "2016-01-24 07:51:51", "2016-01-24 10:50:30",
"2016-01-21 07:02:41", "2016-01-21 09:52:54", "2016-01-26 22:35:52",
"2016-01-19 06:48:13", "2016-01-19 15:18:21", "2016-01-20 12:20:37",
"2016-01-16 07:04:34", "2016-01-24 10:20:05", "2016-01-25 09:01:09",
"2016-01-21 17:02:29", "2016-01-21 11:52:00", "2016-01-27 19:39:16",
"2016-01-19 18:33:35", "2016-01-18 06:00:23", "2016-01-17 01:27:11",
"2016-01-18 10:27:57", "3355", "4935", "5454", "9555", "5938",
"5855", "4888", "3885", "8533", "4359", "5339", "5554", "5894",
"8598", "5448", "9535", "3495", "3358", "3485", "3344", "8489",
"8553", "3354", "5889", "5948", "8455", "5988", "5595", "9354",
"8485", "4559", "4838", "5585", "5585", "8554", "8598", "5535",
"5355", "5844", "3485", "5885", "8833", "8558", "9889", "9885",
"8555", "3938", "8343", "8558", "5484", "3558", "3545", "8394",
"9933", "3853", "4598", "3855", "5845", "5588", "5495", "8585",
"9584", "3385", "8858", "9445", "8488", "8558", "5838", "5848",
"8845", "8848", "8945", "4599", "8585", "8858", "4598", "5358",
"5395", "9485", "4893", "4455", "8493", "9358", "5395", "8958",
"5888", "8888", "8555", "4885", "3538", "8998", "4445", "4838",
"9885", "3559", "5584", "9594", "8558", "3844", "5434", "8558",
"9898", "4395", "9585", "3858", "4858", "5895", "9383", "9858",
"8385", "5585", "4884", "8359", "8893", "3484", "8383", "5338",
"3544", "9859", "9454", "3539", "3583", "8455", "5983", "4345",
"4943", "5548", "8353", "8993", "8594", "8994", "3958", "3989",
"W sWn ae", "o gogynh ", " ntsnagWe", "aiatteaav", "shiytWngg",
"vvmthethW", "Wynhvrrht", "tttnheviv", "itg oiWhe", "a enotisn",
"ehaothe h", "stmeathng", "i emranth", "tersggtnh", "oeiehvhh ",
"sngeeetvg", "gyyhWatge", "ritnhengs", "etihi s e", "aoeertyWn",
"eeytitys ", "nmnmegome", "n vitsnot", " h i eoht", "ahghtangh",
"ehgn hynh", "ener aeig", "t niaat g", "agtWh eah", "vehi amae",
"enhnnn hg", "ennWhgnea", "tay hnaah", "igntyvrtv", "niesehahn",
" eoavongr", "hi ehhimm", "yovgianWi", "e tnehngg", "eyehtte n",
"at nimnrg", "enesgennW", "mhahnhyet", "tt amtgna", "hehtsoish",
"hyvtanggv", "et v nssn", "inhnahe h", "onahhraWn", "mn iiahsy",
" mymisnsg", "magWoshgr", "i t eneve", "nghy naen", "eyhsyehea",
"i ihntvea", "ththnWyri", "vntv yran", "ynaieere ", "yenre htW",
"ehyWga g ", "ngeagmenh", " nW ytito", "ermhaagvr", "eeWvtr eg",
"etreaehon", "thtWyerme", "hnveWnrta", "htmr ohee", "stitnthsi",
"snthhWh a", "ehhth iny", "shgoovema", " mseynWee", "netmiitnt",
"nvi eao", "t seWWay", "yngnerarm", "ggenitaeh", "n eaogiag",
"mitnetmnh", "not sine ", "ghmhnyhne", "eattnatgh", "vhatngtts",
"tntmegten", "hreyatert", "ggmneheri", "g y en he", "igrt ggrh",
"mehnssith", "gigstgnym", "iathWh ii", "h atynin ", "eiieWmetg",
"noyggtive", " iotneng ", "oveieteen", "shnagrhti", "itooo aWv",
"toreytnny", " henaaWvn", "shehnrh W", "ttrntehgi", "oWait tn ",
"hhshhnthh", "nogeamnme", "iraah thh", "eto ngvgr", "Wno tseie",
"ehnato eW", "anservnhn", "htsyyoarv", "n aththe", "vaneav h",
"tmttvniri", "gtmhgrtgv", "h tmtnvgt", " nnaiygnr", "httot ami",
"hehnheeis", "ihtaneito", "eogh h yg", "eWgeiimv ", "sgnyisihh",
"r ngangW", "teihyaeee", "hrytWnhgi", "nniaeavmh", "iotrWehn ",
" gnvgorht", "vyinaaen ", "tgniiseae", "14", "86", "51", "54",
"90", "15", "23", "49", "6", "45", "65", "55", "53", "52", "55",
"84", "74", "74", "45", "88", "4", "76", "65", "41", "77", "40",
"66", "39", "80", "6", "35", "56", "40", "57", "90", "66", "59",
"30", "98", "31", "55", "12", "29", "67", "85", "16", "94", "87",
"61", "55", "94", "95", "68", "10", "45", "41", "93", "55", "13",
"12", "80", "45", "59", "23", "45", "1", "68", "89", "86", "68",
"46", "50", "57", "78", "85", "40", "53", "26", "67", "75", "29",
"78", "91", "35", "37", "10", "90", "36", "9", "14", "36", "31",
"5", "57", "90", "65", "48", "80", "20", "13", "92", "62", "72",
"71", "52", "50", "16", "92", "79", "9", "97", "78", "69", "50",
"84", "96", "82", "95", "44", "2", "76", "13", "1", "16", "65",
"75", "91", "30", "60", "62", "97", "86", "82", "0", "0", "0",
"0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "1", "0", "0",
"0", "0", "0", "0", "0", "0", "0", "0", "1", "0", "0", "0", "0",
"0", "0", "0", "0", "1", "0", "0", "0", "0", "0", "0", "0", "0",
"0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0",
"0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0",
"0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0",
"0", "0", "0", "0", "0", "0", "0", "1", "0", "0", "0", "0", "0",
"0", "0", "0", "0", "0", "1", "0", "0", "0", "0", "0", "0", "0",
"0", "1", "0", "0", "0", "1", "0", "0", "0", "0", "0", "0", "0",
"0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "1"
), .Dim = c(133L, 7L), .Dimnames = list(NULL, c("state", "amounts",
"timestamp", "code", "vendor", "risk", "label")))
The problem is that you tried to turn the entire data.frame into a factor, not just 1 column. That resulted in an atomic vector full of junk, hence the error message you received.
This works:
# Convert the character matrix back to a data.frame, then make every column a
# factor, as sbrl requires (it rejects columns that are not logical or factor).
data_train <- as.data.frame(data_train)
data_train$state <- as.factor(data_train$state)
data_train$amounts <- as.factor(as.character(data_train$amounts))
data_train$timestamp <- as.factor(data_train$timestamp)
data_train$code <- as.factor(data_train$code)
# Fix: the column is named 'vendor' (see the .Dimnames above); the original
# assigned to the misspelled 'vender', so the real column was never converted
# and as.factor(NULL) produced a length-zero replacement instead.
data_train$vendor <- as.factor(data_train$vendor)
# Fix: 'risk' was never converted, although sbrl needs every column as factor.
data_train$risk <- as.factor(data_train$risk)
data_train$label <- as.factor(data_train$label)
# Fix: removed the stray trailing comma after neg_sign, which passed an empty
# argument to sbrl().
sbrl_model <- sbrl(data_train, iters = 20000, pos_sign = "1", neg_sign = "0")
create itemset ...
set transactions ...[48 item(s), 8 transaction(s)] done [0.00s].
sorting and recoding items ... [48 item(s)] done [0.00s].
creating sparse bit matrix ... [48 row(s), 8 column(s)] done [0.00s].
writing ... [48 set(s)] done [0.00s].
Creating S4 object ... done [0.00s].
Eclat
parameter specification:
tidLists support minlen maxlen target ext
FALSE 0.1 1 1 frequent itemsets FALSE
algorithmic control:
sparse sort verbose
7 -2 TRUE
Absolute minimum support count: 12
create itemset ...
set transactions ...[469 item(s), 125 transaction(s)] done [0.00s].
sorting and recoding items ... [4 item(s)] done [0.00s].
creating sparse bit matrix ... [4 row(s), 125 column(s)] done [0.00s].
writing ... [4 set(s)] done [0.00s].
Creating S4 object ... done [0.00s].

Adding a column using the data.table package in R

For an example dataframe:
df = structure(list(country = c("AT", "AT", "AT", "BE", "BE", "BE",
"DE", "DE", "DE", "DE", "DE", "DE", "DE", "DE", "DE", "DE", "DE",
"DE", "DE", "DE"), level = c("1", "1", "1", "1", "1", "1", "1",
"1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1"
), region = c("AT2", "AT1", "AT3", "BE2", "BE1", "BE3", "DE4",
"DE3", "DE9", "DE7", "DE1", "DEE", "DEG", "DE2", "DED", "DEB",
"DEA", "DEF", "DE6", "DE8"), N = c("348", "707", "648", "952",
"143", "584", "171", "155", "234", "176", "302", "144", "148",
"386", "257", "126", "463", "74", "44", "119"), result = c("24.43",
"26.59", "20.37", "23.53", "16.78", "25.51", "46.2", "43.23",
"41.03", "37.5", "33.44", "58.33", "47.97", "34.46", "39.69",
"31.75", "36.93", "43.24", "36.36", "43.7")), .Names = c("country",
"level", "region", "N", "result"), class = c("data.table", "data.frame"
), row.names = c(NA, -20L))
I am using the following code to create a summary dataframe, listing the max and min values by country:
variable_country <- setDT(df)[order(country), list(min_result = min(result), max_result = max(result)), by = c("country")]
I also wish to include the variable 'level' from 'df'' - how would I do this in R? i.e. my variable_country dataframe would have an extra column to show that these particular countries are at level (1) . The dataframe should just have an extra column, but still three observations (one for each country). All observations for each country are at the same level.
If there is only a single 'level' for each 'country', we can create the summarised dataset with including the first observation of 'level' (level[1L]).
# level[1L] takes the first 'level' value within each country group — safe
# here because every row of a given country shares the same level.
setDT(df)[order(country), list(min_result = min(result),
max_result = max(result), level= level[1L]), by = country]
Having said that, another option would be to use 'level' as the grouping variable, i.e. by = .(country, level)] in the code. (as suggested by #David Arenburg)

Resources