Collapse rows to lowest complete row in each set - r

I am cleaning a gigantic dataset, which came from using tabulizer() on a PDF.
The columns are correctly delineated, but I have many rows where one cell in the original was gigantic, and tabulizer read it as several rows, with all cells blank except the large one. I need to collapse the data frame so that rows are collapsed "down" to the lowest complete row.
Here is a sample of what the data look like:
As you can see, the column that these "extra rows" appear in varies by row (in one case it's species, in other cases it's area.of.operation). I want to collapse them to complete rows, such that row 1 remains intact, row 2 is actually rows 2:6 collapsed, row 7 is intact, etc. I don't even know if R is the best tool to use for this, but I would love it if there is a dplyr solution. Example dataframe below.
Thank you in advance.
mydata <- structure(list(X = c(1L, 2L, 3L, 4L, 5L, 6L, 7L, 11L, 12L, 13L,
17L), target.species = structure(c(4L, 1L, 1L, 1L, 1L, 5L, 4L,
1L, 1L, 2L, 3L), .Label = c("", "hake", "hake, southern", "rosefish",
"squid, cuttlefish,"), class = "factor"), gear = structure(c(2L,
1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 3L, 2L), .Label = c("", "trawl, bottom",
"trawl, midwater"), class = "factor"), number.boats = structure(c(2L,
1L, 1L, 1L, 1L, 3L, 5L, 1L, 1L, 4L, 4L), .Label = c("", "18 vessels",
"98 refrigerated high", "none provided", "seas vessels"), class = "factor"),
company = structure(c(2L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L,
2L, 2L), .Label = c("", "not applicable"), class = "factor"),
area.of.operation = structure(c(2L, 1L, 1L, 1L, 3L, 4L, 2L,
3L, 4L, 2L, 5L), .Label = c("", "above provinces", "annual fishery; EEZ",
"concentrated around", "deepwater coastal"), class = "factor"),
species = structure(c(6L, 3L, 4L, 5L, 9L, 8L, 7L, 9L, 8L,
1L, 2L), .Label = c("Fur seal", "none provided", "otter",
"otter, river", "porpoise", "seal", "Seal", "South American Sea lion,",
"spectacled porpoise,"), class = "factor"), estimates = structure(c(2L,
1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 2L, 2L), .Label = c("", "none"
), class = "factor")), class = "data.frame", row.names = c(NA,
-11L))

The old cumsum-splitting stratagem of pasting each column with collapse="," and then sub-ing out the excess commas gets you most of the way:
# Collapse each run of rows into a single row per group. A new group starts
# at every row whose target.species is non-empty: cumsum() over that logical
# vector yields the group id, so the blank-species rows that follow a
# non-empty one are placed in the same group.
t( as.data.frame( # transpose because of the column oriented nature of R's apply returns
lapply( split(mydata, cumsum( mydata$target.species != "")),
# Within a group: join each column's values with ",", then replace everything
# from the first comma through the last comma (greedy match) with a single ",".
function(d){ sub(",.*,", ",", lapply( d, paste, collapse=","))})))
[,1] [,2] [,3] [,4] [,5] [,6]
X1 "1,5" "rosefish," "trawl," "18 vessels," "not applicable," "above provinces,annual fishery; EEZ"
X2 "6" "squid," "" "98 refrigerated high" "" "concentrated around"
X3 "7,12" "rosefish," "trawl," "seas vessels," "not applicable," "above provinces,concentrated around"
X4 "13" "hake" "trawl, midwater" "none provided" "not applicable" "above provinces"
X5 "17" "hake, southern" "trawl, bottom" "none provided" "not applicable" "deepwater coastal"
[,7] [,8]
X1 "seal," "none,"
X2 "South American Sea lion," ""
X3 "Seal," "none,"
X4 "Fur seal" "none"
X5 "none provided" "none"

Related

Regex for 11, 12 etc but not 1

I'm trying to filter a data set, and only keep Scenarios 11, 12, 13, 14 etc (but not Scenario 1).
My input looks like this:
structure(list(Title.1 = structure(c(1L, 7L, 8L, 9L, 10L, 11L,
12L, 13L, 14L, 2L, 3L, 4L, 5L, 6L, 1L, 7L, 8L, 9L, 10L, 11L,
12L, 13L, 14L, 2L, 3L, 4L, 5L, 6L), .Label = c("Scenario 1",
"Scenario 10", "Scenario 11", "Scenario 12", "Scenario 13", "Scenario 14",
"Scenario 2", "Scenario 3", "Scenario 4", "Scenario 5", "Scenario 6",
"Scenario 7", "Scenario 8", "Scenario 9"), class = "factor"),
Color = structure(c(2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L), .Label = c("Blue", "Red"), class = "factor")), class = "data.frame", row.names = c(NA,
-28L))
and my output would ideally look like this:
structure(list(Title.1 = structure(c(2L, 3L, 4L, 5L, 6L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L), .Label = c("", "Scenario 10", "Scenario 11",
"Scenario 12", "Scenario 13", "Scenario 14"), class = "factor"),
Color = structure(c(2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L), .Label = c("", "Blue"), class = "factor")), class = "data.frame", row.names = c(NA,
-28L))
I could define what I'm keeping (ie. keep = c('Scenario 11', 'Scenario 12' etc) and then filter it by "Title" and then filter it again by 'Color', but I'm sure I could do it in one line with a Regex.
My issue is that I can't seem to tell it to ignore Scenario 1.
Could someone please point me in the right direction?
If you want to do this with regex we can use grepl to select rows which have "1" followed by another digit along with Color = 'Blue'.
subset(df, grepl('\\b1\\d\\b', Title.1) & Color == 'Blue')
# Title.1 Color
#24 Scenario 10 Blue
#25 Scenario 11 Blue
#26 Scenario 12 Blue
#27 Scenario 13 Blue
#28 Scenario 14 Blue
We can use filter with str_detect from tidyverse
library(dplyr)
library(stringr)
df %>%
filter(str_detect(Title.1, '\\b1\\d\\b') & Color == 'Blue')

Bar chart showing NA bar when there are no NA values

My visualisation is showing an NA bar chart despite the fact that I have imputed all NA values in my incomeLev column and explicitly removed all NA values from the mental health (which is in my stacked bar visualisation)
brfss2013$mentalHealth <- forcats::fct_explicit_na(brfss2013$mentalHealth, na_level = "Missing")
brfss2013$incomeLev <- as.factor(brfss2013$incomeLev)
brfss2013 <- subset(brfss2013, !is.na(incomeLev))
brfss2013 %>%
add_count(incomeLev) %>%
rename(count_inc = n) %>%
count(incomeLev, mentalHealth, count_inc) %>%
rename(count_mentalHealth = n) %>%
mutate(percent= count_mentalHealth / count_inc) %>%
mutate(incomeLev = factor(incomeLev,
levels=c('0-$20k','25-$35k','35-$50k','50-$75k','>$75k')))%>%
ggplot(aes(x= incomeLev,
y= count_mentalHealth,
group= mentalHealth)) +
xlab('Annual Income')+ylab('Number of People')+
geom_bar(aes(fill=mentalHealth),
stat="identity",na.rm=TRUE)+
# Using the scales package does the percent formatting for you
geom_text(aes(label = scales::percent(percent)),position = position_stack(vjust = 0.5))+
theme_minimal()
Here is a sample of my data:
brfss2013<-structure(list(incomeLev = structure(c(5L, 1L, 1L, 5L, 4L, 1L,
1L, 4L, 1L, 3L), .Label = c(">$75k", "0-$20k", "25-$35k", "35-$50k",
"50-$75"), class = "factor"), healtheat = c(4.66, 1.68, 2.37,
1.85, 2.5, 3, 3.66, 4.27, 2.72, 1.72), X_age_g = structure(c(5L,
4L, 5L, 5L, 6L, 4L, 3L, 5L, 4L, 6L), .Label = c("Age 18 to 24",
"Age 25 to 34", "Age 35 to 44", "Age 45 to 54", "Age 55 to 64",
"Age 65 or older"), class = "factor"), employ1 = structure(c(7L,
1L, 1L, 7L, 7L, 1L, 1L, 7L, 7L, 5L), .Label = c("Employed for wages",
"Self-employed", "Out of work for 1 year or more", "Out of work for less than 1 year",
"A homemaker", "A student", "Retired", "Unable to work"), class = "factor"),
renthom1 = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L,
1L), .Label = c("Own", "Rent", "Other arrangement"), class = "factor"),
sex = structure(c(2L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 1L, 2L), .Label = c("Male",
"Female"), class = "factor"), physLev = structure(c(3L, 1L,
3L, 1L, 2L, 1L, 2L, 1L, 2L, 2L), .Label = c("0-200", "200-500",
"500-1000", "1000-2000", "2000-4000", "4000-10000", ">10000"
), class = "factor"), mentalHealth = structure(c(5L, 1L,
1L, 1L, 1L, 1L, 3L, 1L, 1L, 1L), .Label = c("Excellent",
"Good", "Ok", "Bad", "Very Bad", "Missing"), class = "factor")), row.names = c(NA,
10L), class = "data.frame")

Warning message In `[<-.factor`(`*tmp*`, iseq, value = foo) : invalid factor level, NA generated when trying to add vector to row subset

I'm writing a function that attempts to add values in a single row of a data.frame in several columns at once:
require(stringr)
# Write the non-NA entries of `pointsVector` into row `keyRowNum` of every
# column of `df` whose name matches `searchStringForPointColNames`, then print
# that row. The assignment is only attempted when the number of non-NA points
# equals the number of matching columns. Nothing is returned explicitly: the
# updated copy only lives in the local `newDf`.
addPointsToKeyRow = function(df, keyRowNum, searchStringForPointColNames, pointsVector){
# str_match() gives NA for column names that do not match the pattern
colsWithMatchingSearchResults = str_match(colnames(df), searchStringForPointColNames)
# Positions of the matching columns.
# NOTE(review): assumes the pattern has no capture groups, so the match
# matrix has a single column and which() lines up with column positions.
pointColNums = (which(!is.na(colsWithMatchingSearchResults)))
# Drop missing point values before comparing lengths
pointsVectorCleaned = pointsVector[!is.na(pointsVector)]
print(is.vector(pointsVectorCleaned)) #Returns TRUE
print(is.data.frame(pointsVectorCleaned)) #Returns FALSE
print(pointsVectorCleaned)
if(length(pointsVectorCleaned) == length(pointColNums)){
# NOTE(review): stringsAsFactors = FALSE only affects character columns being
# added; columns of `df` that are already factors appear to stay factors here,
# which would explain the "invalid factor level" warning -- confirm.
newDf = data.frame(df, stringsAsFactors = FALSE)
newDf[keyRowNum, pointColNums] = as.character(pointsVectorCleaned)
#for(i in 1:length(pointColNums)){
# newDf[keyRowNum,pointColNums[i]]=as.character(pointsVectorCleaned[i])
#}
print(newDf[keyRowNum,])
}
}
When I apply the function to my data (addPointsToKeyRow(finalDf, which(finalDf[,1]=="key"), "points_q", pointVals)), I get the following warnings:
In [<-.factor(*tmp*, iseq, value = "2") :
invalid factor level, NA generated
I've looked for the error on SO and other sites, and the recommendation always seems to be to make sure your data.frame has stringsAsFactors = FALSE.
I think my issue might be that when I subset the data.frame (newDf[keyRowNum, pointColNums]), it no longer keeps stringsAsFactors = FALSE.
Regardless of whether that's the issue or not, I'd very much welcome some help solving this weird issue. Many thanks in advance!
For the sake of an example, let's say df is:
df = structure(list(first = structure(c(7L, 9L, 5L, 4L, 10L, 2L, 3L,
6L, 1L, 8L), .Label = c("autumn", "spring", "summer", "winter",
"july", "betty", "november", "echo", "victor", "tango"), class = "factor"),
last = structure(c(6L, 2L, 4L, 5L, 1L, 8L, 3L, 9L, 10L, 7L
), .Label = c("brummett1", "do", "drorbaugh", "galeno", "gerber",
"key", "lyons", "pecsok", "perezfranco", "swatt"), class = "factor"),
question1 = structure(c(1L, 1L, 1L, 4L, 6L, 2L, 5L, 3L, 5L,
5L), .Label = c("0", "0.25", "1:02:01", "1:2 50%", "2-Jan",
"50%"), class = "factor"), points_q1 = structure(c(1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "", class = "factor"),
question2 = structure(c(8L, 10L, 6L, 5L, 2L, 3L, 7L, 1L,
4L, 9L), .Label = c(" a | b; A| Aa | Ab; b| ab | bb; the possibility that the offspring will be heterozygous is about 25%. The same goes for the homozygous recessive it is a 1:1:1:1",
"1/4 heterozygous for \xf1a\xee and 0 recessive for \xf1b\xee",
"16-Mar", "2-Jan", "3:1 25%", "4-Jan", "Male=aabb Female=AAbb Heterozygous is going to be 1/2. Homozygous is going to be 1/4.",
"possible offspring genotypes (each with probability of 0.25): AABb AaBb AAbb Aabb. Question is asking about probability of Aabb_ which is 0.25.",
"The square shows Ab Ab_ Bb Bb so 50% or 1/2. ", "Xa Yb (father) crossed with XA Xb (mother) = 1/2 "
), class = "factor"), points_q2 = structure(c(1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "", class = "factor"),
question3 = structure(c(4L, 5L, 3L, 5L, 5L, 5L, 7L, 2L, 6L,
1L), .Label = c("Codominance", "coheritance", "incomplete dominance",
"Incomplete dominance", "Incomplete dominance ", "Incomplete dominance. ",
"Independent Assortment"), class = "factor"), points_q3 = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "", class = "factor"),
question4 = structure(c(3L, 4L, 2L, 3L, 6L, 3L, 7L, 1L, 5L,
4L), .Label = c("", "co-dominance", "Codominance", "Codominance ",
"Codominance. ", "Codominant ", "Independent Assortment? (Wrong)"
), class = "factor"), points_q4 = structure(c(1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "", class = "factor"),
question5 = structure(c(2L, 10L, 6L, 4L, 5L, 3L, 8L, 1L,
7L, 9L), .Label = c(" X | Y; X| XX | XY; x| Xx | xY; the percentage will be 25 % or 1/4 the same applies to the son ",
"0 for daughter_ because male can only give non-colorblind X chromosome (because he's not colorblind an only has one X chromosome). 0.25 for both son and colorblind.",
"0.25", "25% for son and 25% for daughter", "25% for the son and 25% for the daughter ",
"4-Jan", "50%", "Father=XY Mother=X2Y Therefore_ by using the punnet square_ I was able to show/understand that the probability of them having a son AND him being colorblind is 1/4.",
"To have a son or daughter is 50/50. To have a colorblind daughter is .25 whereas to have a colorblind son is .75 because it is carried on the X chromosome and the son is much more likely to inherit this because he has less x to work with",
"XcY (father) XC Xc (mother) Daughter is 1/4 son 1/4"), class = "factor"),
points_q5 = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L), .Label = "", class = "factor"), question6 = structure(c(3L,
6L, 7L, 8L, 5L, 2L, 10L, 9L, 4L, 1L), .Label = c("Chromatids ",
"Chromosomes (diploids)", "homologous chromosome pairs",
"Homologous chromosome pairs are being separated. ", "Homologous chromosomes ",
"Homologous pairs ", "homologous pairs of chromosomes", "Homologus Chromosomes ",
"sister chromatids ", "Sister Chromatids?"), class = "factor"),
points_q6 = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L), .Label = "", class = "factor"), question7 = structure(c(6L,
8L, 5L, 7L, 8L, 2L, 3L, 1L, 9L, 4L), .Label = c("", "Chromatids (haploids)",
"Daughter Chromosomes?", "One cell to 2", "sister chromatids",
"Sister chromatids", "Sister Chromatids", "Sister chromatids ",
"Sister chromatids within daughter cells are separating. "
), class = "factor"), points_q7 = structure(c(1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "", class = "factor"),
question8 = structure(c(1L, 4L, 1L, 2L, 4L, 2L, 3L, 6L, 5L,
3L), .Label = c("sister chromatids", "Sister chromatids",
"Sister Chromatids", "Sister chromatids ", "Sister chromatids are held together by the centromeres. In prophase chromosomes become visible. During metaphase chromosomes attach to spindles. During Anaphase the chromosomes are split apart and in telophase the cells start to create cleavage. ",
"sisters chromatides"), class = "factor"), points_q8 = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "", class = "factor"),
question9 = structure(c(2L, 4L, 1L, 3L, 4L, 3L, 3L, 2L, 5L,
3L), .Label = c("prohase ", "prophase", "Prophase", "Prophase ",
"They condense during prophase before the rest of the phases. "
), class = "factor"), points_q9 = structure(c(1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "", class = "factor"),
question10 = structure(c(1L, 3L, 1L, 2L, 3L, 2L, 2L, 1L,
4L, 2L), .Label = c("anaphase", "Anaphase", "Anaphase ",
"During anaphase. "), class = "factor"), points_q10 = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "", class = "factor"),
question11 = structure(c(3L, 4L, 3L, 4L, 4L, 4L, 4L, 3L,
1L, 2L), .Label = c("During prophase. ", "Telephase ", "telophase",
"Telophase"), class = "factor"), points_q11 = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "", class = "factor"),
question12 = structure(c(1L, 3L, 1L, 2L, 3L, 2L, 3L, 1L,
4L, 2L), .Label = c("metaphase", "Metaphase", "Metaphase ",
"Metaphase. "), class = "factor"), points_q12 = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "", class = "factor"),
question13 = structure(c(1L, 4L, 1L, 4L, 2L, 4L, 2L, 5L,
3L, 6L), .Label = c("centromere", "Centromere", "Centromere. ",
"Centromeres", "centromeres ", "Cleavage"), class = "factor"),
points_q13 = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L), .Label = "", class = "factor")), .Names = c("first",
"last", "question1", "points_q1", "question2", "points_q2", "question3",
"points_q3", "question4", "points_q4", "question5", "points_q5",
"question6", "points_q6", "question7", "points_q7", "question8",
"points_q8", "question9", "points_q9", "question10", "points_q10",
"question11", "points_q11", "question12", "points_q12", "question13",
"points_q13"), row.names = c(NA, -10L), class = "data.frame")
which(finalDf[,1]=="key") is 1.
pointVals is c(NA, "2", "2", "2", "2", "2", "2", "2", "1", "1", "1", "1",
"1", "1")
For clarification, I'd want the final table to look something like:
First Last question1 points_q1 question2 points_q2 etc.
key key 0 2 "possible_offspring_genotypes..." 1 etc.
I have reduced your function based on my understanding; let me know if it gives what you want or if I have misunderstood something.
# Fill row `keyRowNum` of every column whose name matches `searchString`
# with the values in `pointsVector` and return the resulting data frame.
# When the number of matching columns differs from the number of values,
# `df` is returned untouched.
addPointsToKeyRow <- function(df, keyRowNum, searchString, pointsVector) {
  # Logical mask over the columns: TRUE where the name contains the pattern
  is_point_col <- grepl(searchString, colnames(df))
  # Guard: need exactly one value per matching column
  if (sum(is_point_col) != length(pointsVector)) {
    return(df)
  }
  # One value per matching column, written into the requested row
  df[keyRowNum, is_point_col] <- pointsVector
  df
}
# Convert every column from factor to character so that new values can be
# assigned without triggering "invalid factor level" warnings
df[] <- lapply(df, as.character)
# Define the values to be written (one per points_q* column, NA already removed)
pointVals <- c("2", "2", "2", "2", "2", "2", "2", "1", "1", "1", "1","1", "1")
# Call the function. R is case-sensitive, so the argument must be spelled
# exactly like the vector defined above (`pointVals`, not `pointsval`).
df <- addPointsToKeyRow(df, 1, "points_q", pointVals)
# Check the dataframe
df

Creating output based on multiple criteria

I have 2 data frames for 2 stacks that give information about potential emissions. One data frame gives the hours at which the system turns on and off in each of the 4 seasons. Each season starts on a specific date. The 2nd file gives me the details of each stack.
I am trying with some sample file to test how to do this and so far I have managed to create a function following stack overflow example that allow me to create a data frame with the dates that I would like and a column with seasons for each date. I am really struggling now with the programming concept to understand how do I combine the 3 data frames to create the output template that I am trying to set up.
To show you an example my sample input are:
Stack_info File:
example seasonal Profile that shows when the system is on or off:
and the output I am after should create data frames for each year in the following format (only the black font and the red text to just explain what the values are):
What I am finding most difficult is that the output file for each year will have a unique first row, the 2nd row will repeat for each pollutant, and from the 3rd row on come the hourly data for all 8760 hours. This needs to repeat for the next pollutant.
So far I have managed to create a function that helps me to assign season to each day of the year. For example:
#function to create seasons
# d() returns the row index in `lut` of a "Mon-DD" label. It reads the global
# `lut`, which is looked up when d() is called, so defining d() first is fine.
d = function(month_day) which(lut$month_day == month_day)
# Lookup table with one row per 24-hour step starting at 2012-01-01
# (0:365 gives 366 rows); season is filled in below
lut = data.frame(all_dates = as.POSIXct("2012-1-1") + ((0:365) * 3600 * 24),
season = NA)
# Label each row as "Mon-DD".
# NOTE(review): %b is locale-dependent; the "Jan-01"-style labels used below
# presumably assume an English locale -- confirm.
lut = within(lut, { month_day = strftime(all_dates, "%b-%d") })
# Assign seasons by row ranges between the boundary dates; winter wraps
# around the end of the year, so it is built from two ranges
lut[c(d("Jan-01"):d("Mar-15"), d("Nov-08"):d("Dec-31")), "season"] = "winter"
lut[c(d("Mar-16"):d("Apr-30")), "season"] = "spring"
lut[c(d("May-01"):d("Sep-27")), "season"] = "summer"
lut[c(d("Sep-28"):d("Nov-07")), "season"] = "autumn"
# Use the "Mon-DD" labels as row names so the table can be indexed by label
rownames(lut) = lut$month_day
## create date data frame and assign seasons
# One row per calendar day from 2010-01-01 to 2012-12-31
dates = data.frame(dates =seq(as.Date('2010-01-01'),as.Date('2012-12-31'),by = 1))
# Look up each day's season through its "Mon-DD" label in `lut`
dates = within(dates, {
season = lut[strftime(dates, "%b-%d"), "season"]
})
This gives me a dates data frame and my other 2 samples data frames are (as shown in the image):
structure(list(`Source no` = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), Source = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L), .Label = c("Stack 1", "Stack 2"), class = "factor"),
Period = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), Day = structure(c(2L,
6L, 7L, 5L, 1L, 3L, 4L, 2L, 6L, 7L, 5L, 1L, 3L, 4L, 2L, 6L,
7L, 5L, 1L, 3L, 4L), .Label = c("Fri", "Mon", "Sat", "Sun",
"Thu", "Tue", "Wed"), class = "factor"), `Spring On` = c(0L,
0L, 0L, 0L, 0L, 0L, 0L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 15L,
15L, 15L, 15L, 15L, 15L, 15L), `Spring Off` = c(23L, 23L,
23L, 23L, 23L, 23L, 23L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 18L,
18L, 18L, 18L, 18L, 18L, 18L), `Summer On` = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L), .Label = "off", class = "factor"), `Summer Off` = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L), .Label = "off", class = "factor"), `Autumn On` = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L), .Label = "off", class = "factor"), `Autumn Off` = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L), .Label = "off", class = "factor"), `Winter On` = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L,
2L, 2L, 2L, 2L, 2L), .Label = c("0", "off"), class = "factor"),
`Winter Off` = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("23",
"off"), class = "factor")), .Names = c("Source no", "Source",
"Period", "Day", "Spring On", "Spring Off", "Summer On", "Summer Off",
"Autumn On", "Autumn Off", "Winter On", "Winter Off"), class = "data.frame", row.names = c(NA,
-21L)) -> profile
structure(list(SNAME = structure(1:2, .Label = c("Stack 1", "Stack 2"
), class = "factor"), ISVARY = c(1L, 4L), VELVOL = c(1L, 4L),
TEMPDENS = c(0L, 2L), `DUM 1` = c(999L, 999L), `DUM 2` = c(999L,
999L), NPOL = c(2L, 2L), `EXIT VEL` = c(26.2, 22.4), TEMP = c(341L,
328L), `STACK DIAM` = c(1.5, 2.5), W = c(0L, 15L), Nox = c(39,
33.3), Sox = c(15.5, 17.9)), .Names = c("SNAME", "ISVARY",
"VELVOL", "TEMPDENS", "DUM 1", "DUM 2", "NPOL", "EXIT VEL", "TEMP",
"STACK DIAM", "W", "Nox", "Sox"), class = "data.frame", row.names = c(NA,
-2L)) -> stack_info
If anyone could give me any guidance of how I can proceed with the programming part would be really useful as I am just not sure how I can approach this to create separate output files as data frame for year 2010, 2011 and 2012.
The way your data is organised isn't ideal for processing. Maybe have a look at Hadley Wickham's paper about tidy data.
According to your desired output you need a dataframe with the number of lines equal to the number of hours a specific machine (stack n) is switched on. Therefore I suggest you create a dataframe containing every hour of a given year:
# One row per hour of 2010. The upper bound must be the LAST hour of Dec 31
# (23:00); using "2010-12-31" alone means midnight at the start of that day
# and silently drops the final 23 hours (8737 rows instead of 8760).
d.out <- data.frame(
  dates = seq(from = as.POSIXct("2010-01-01"),
              to = as.POSIXct("2010-12-31 23:00"),
              by = 3600)
)
# Calendar components used later for joining and grouping
d.out$year <- as.numeric(format(d.out$dates, "%Y"))
d.out$month <- as.numeric(format(d.out$dates, "%m"))
d.out$day <- as.numeric(format(d.out$dates, "%d"))
d.out$hour <- as.numeric(format(d.out$dates, "%H"))
# Abbreviated weekday name; %a is locale-dependent, so it only matches
# labels such as "Mon"/"Tue" in an English locale
d.out$weekday <- as.character(format(d.out$dates, "%a"))
# Day of year (1-365)
d.out$doj <- as.numeric(format(d.out$dates, "%j"))
# Season by day of year; winter is the default and wraps around the new year.
# In a non-leap year such as 2010 the cut-offs are 75 = Mar-16, 121 = May-01,
# 271 = Sep-28 and 312 = Nov-08; in a leap year they shift by one day.
d.out$season <- "Winter"
d.out$season[d.out$doj >= 75 & d.out$doj < 121] <- "Spring"
d.out$season[d.out$doj >= 121 & d.out$doj < 271] <- "Summer"
d.out$season[d.out$doj >= 271 & d.out$doj < 312] <- "Autumn"
The goal is to join this dataframe with your profile dataframe. Before joining, the profile-df has to be rearranged:
library(dplyr)
library(tidyr)
# Reshape `profile` from one column per "<Season> On"/"<Season> Off" into one
# row per Source/Period/Day/season with numeric `On` and `Off` columns.
profile_new =
profile %>%
# Stack every column except the four id columns into key/value pairs:
# `season` holds the old column name, `hour` holds its value
gather(season, hour, -c(`Source no`, Source, Period, Day)) %>%
# Split names such as "Spring On" into season = "Spring" and status = "On"
extract(season, c("season", "status"), "(\\w+?)\\s(\\w+)") %>%
# Drop entries marked "off" (system not running in that season)
filter(hour != "off") %>%
# NOTE(review): relies on `hour` being character after gather(); as.numeric()
# on a factor would return level codes instead -- confirm.
mutate(Day = as.character(Day), hour=as.numeric(hour)) %>%
# Spread the status values back out into separate `On` and `Off` columns
spread(status, hour)
Now it's easy to join the three dataframes to put together all the information you need to create your output:
# Combine the hourly calendar, the on/off profile and the stack parameters
# into one table with a row per stack and hour.
d.out %>%
# Attach every profile row for the same weekday and season to each hour.
# Being an inner join, hours with no matching profile row are dropped (for
# example seasons whose entries were all "off" and were filtered out when
# building profile_new).
inner_join(profile_new, by=c("weekday"="Day", "season"="season")) %>%
group_by(Source, dates, year, day, weekday, season, hour) %>%
# A stack is on in a given hour if that hour lies inside any On-Off window
summarise(status = any(hour >= On & hour <= Off)) %>%
# Add the stack parameters and emission rates
inner_join(stack_info, by=c("Source"="SNAME")) %>%
# Emissions are zero whenever the stack is switched off
mutate(Nox = ifelse(status, Nox, 0),
Sox = ifelse(status, Sox, 0)) %>%
arrange(Source, year, dates, hour) %>%
select(Source, year, day, weekday, season, hour, `EXIT VEL`, TEMP, `STACK DIAM`, W, Nox, Sox)
Obviously it's not quite the format you posted. From here you could write your dataframe to a csv (stack by stack by using append = TRUE).

Indicator feature creation in R based on multiple columns

I have a dataset with 10 columns, 3 of which are of interest for creating a new indicator feature. The features are "pT", "pN", & "M" and they all take different values. Of all the values that these 3 features take, there are a total of 9 unique combinations that need to be captured in the new variable.
PATHOT PATHON PATHOM
1 pT2 pN1 M0
4 pT1 pN1 M0
13 pT3 pN1 M0
161 pT1 *pN2 M0
391 pT1 pN1 *M1
810 *pTIS pN1 M0
948 pT3 *pN2 M0
1043 pT2 pN1 *M1
1067 *pT4 pN1 M0
For example, the new variable will have value "1" when PATHOT=pT2, PATHON=pN1 & PATHOM=M0, and so on up to value 9. I have completed the task, but only after spending almost 20 lines of code on vectorised operations covering all unique combinations.
diag3_bs$sfd[diag3_bs$pathot=="pT2" & diag3_bs$pathon=="pN1" &
diag3_bs$pathom=="M0"] <- 1
diag3_bs$sfd[diag3_bs$pathot=="pT1" & diag3_bs$pathon=="pN1" &
diag3_bs$pathom=="M0"] <- 2
diag3_bs$sfd[diag3_bs$pathot=="pT3" & diag3_bs$pathon=="pN1" &
diag3_bs$pathom=="M0"] <- 3... so on upto 9.
I want to ask if there is a better more automated way of getting the same result?
dput(data.frame) is given below
structure(list(F_STATUS = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L), .Label = "Y", class = "factor"), EVENT_ID = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "BASELINE", class =
"factor"),
PAG_NAME = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L), .Label = "BR2", class = "factor"), PTSIZE = c(3, 4,
2.7, 2, 0.9, 3, 3, 0.9, 3, 4.5), PTSIZE_U = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "CM", class = "factor"),
PT_SYM = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L), .Label = c("", "-", "<", ">"), class = "factor"), PATHOT = structure(c(4L,
4L, 4L, 3L, 3L, 4L, 4L, 3L, 4L, 4L), .Label = c("*pT4", "*pTIS",
"pT1", "pT2", "pT3"), class = "factor"), PATHON = structure(c(2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("*pN2", "pN1"
), class = "factor"), PATHOM = structure(c(2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L), .Label = c("*M1", "M0"), class = "factor"),
RSUBJID = 901000:901009, RUSUBJID = structure(1:10, .Label = c(
"000301-000-901-251", "000301-000-901-252", "000301-000-901-253",
"000301-000-901-254", "000301-000-901-255", "000301-000-901-256",
"000301-000-901-257", "000301-000-901-258", "000301-000-901-259",
"000301-000-901-260", "000301-000-901-261", "000301-000-901-262")
, class = "factor")), .Names = c("F_STATUS", "EVENT_ID", "PAG_NAME", "PTSIZE", "PTSIZE_U", "PT_SYM", "PATHOT",
"PATHON", "PATHOM", "RSUBJID", "RUSUBJID"), row.names = c(NA, 10L),
class = "data.frame")
Thanks.
I tried to edit the data so it didn't throw an error on input. Also created a version of that tabulation of possible combinations:
stg_tbl <- structure(list(PATHOT = structure(c(4L, 3L, 5L, 3L, 3L, 2L, 5L,
4L, 1L), .Label = c("*pT4", "*pTIS", "pT1", "pT2", "pT3"), class = "factor"),
PATHON = structure(c(2L, 2L, 2L, 1L, 2L, 2L, 1L, 2L, 2L), .Label = c("*pN2",
"pN1"), class = "factor"), PATHOM = structure(c(2L, 2L, 2L,
2L, 1L, 2L, 2L, 1L, 2L), .Label = c("*M1", "M0"), class = "factor")), .Names = c("PATHOT",
"PATHON", "PATHOM"), class = "data.frame", row.names = c("1",
"4", "13", "161", "391", "810", "948", "1043", "1067"))
Make a vector of text-equivalents of the categories:
stg_lbls <- with(stg_tbl, paste(PATHOT, PATHON, PATHOM, sep="_") )
Then the as.numeric values of a factor created using those levels will be the desired result:
dat$stg <- with(dat, factor( paste(PATHOT, PATHON, PATHOM, sep="_"), levels=stg_lbls))
as.numeric(dat$stg)
#[1] 1 1 1 2 2 1 1 2 1 1
You can just assign those values in the usual way:
dat$sfd <- as.numeric(dat$stg)
I made some new data, that should be useful for your problem.
k<-expand.grid(data.frame(a=letters[1:3],b=letters[4:6],c=letters[7:9]))
library(dplyr)
k %>% mutate(groups=paste0(a,b,c))->k2
k2$groups<-as.numeric(factor(k2$groups))
k2
It's crude, and you're not picking which combination gets which number, so it'd take some digging afterwards, but it's quick.

Resources