I am trying to split a large dataset,
assign column names with a loop, and
save all the individual pieces back into a single stacked file.
I am using some sample data as follows:
so firstly I split the datasets into 2 based on number of sources in the first column and read in a list using the following code:
out <- split( sample , f = sample$Source)
Now I am struggling to set up a loop to change the colnames for columns 2 to 8
by matching the existing colnames to the following 'info' table and replacing based on source name as in the first column of the 'info' table.
the info table looks like this:
so the loop should change the colnames similar to this:
I am just wondering if anyone has done something similar could advise me?
Also, when I try to join them together I can only set the colnames once using the merge function. Is there any way to stack them so that I can preserve the colnames for each table, so that the result looks something like this?
my sample input files are:
> dput(sample)
structure(list(Source = structure(c(1L, 1L, 1L, 1L, 2L, 2L, 2L,
2L, 2L), .Label = c("Stack 1", "Stack 2"), class = "factor"),
year = c(2010L, 2010L, 2010L, 2010L, 2010L, 2010L, 2010L,
2010L, 2010L), day = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L
), hour = c(0L, 1L, 2L, 3L, 0L, 1L, 2L, 3L, 4L), `EXIT VEL` = c(26.2,
26.2, 26.2, 26.2, 22.4, 22.4, 22.4, 22.4, 22.4), TEMP = c(341L,
341L, 341L, 341L, 328L, 328L, 328L, 328L, 328L), `STACK DIAM` = c(1.5,
1.5, 1.5, 1.5, 2.5, 2.5, 2.5, 2.5, 2.5), W = c(0L, 0L, 0L,
0L, 15L, 15L, 15L, 15L, 15L), Nox = c(39, 39, 39, 39, 33.3,
33.3, 33.3, 33.3, 33.3), Sox = c(15.5, 15.5, 15.5, 15.5,
17.9, 17.9, 17.9, 17.9, 17.9)), .Names = c("Source", "year",
"day", "hour", "EXIT VEL", "TEMP", "STACK DIAM", "W", "Nox",
"Sox"), class = "data.frame", row.names = c(NA, -9L))
> dput(stack_info)
structure(list(SNAME = structure(1:2, .Label = c("Stack 1", "Stack 2"
), class = "factor"), ISVARY = c(1L, 4L), VELVOL = c(1L, 4L),
TEMPDENS = c(0L, 2L), `DUM 1` = c(999L, 999L), `DUM 2` = c(999L,
999L), NPOL = c(2L, 2L), `EXIT VEL` = c(26.2, 22.4), TEMP = c(341L,
328L), `STACK DIAM` = c(1.5, 2.5), W = c(0L, 15L), Nox = c(39,
33.3), Sox = c(15.5, 17.9)), .Names = c("SNAME", "ISVARY",
"VELVOL", "TEMPDENS", "DUM 1", "DUM 2", "NPOL", "EXIT VEL", "TEMP",
"STACK DIAM", "W", "Nox", "Sox"), class = "data.frame", row.names = c(NA,
-2L))
thanks in advance
The best I ended with is this:
out <- split(sample, f = sample$Source) # your original step
# Store the source names as strings, so that unlist() below yields the names
# themselves and not the factor's index numbers
stack_info[, 1] <- as.character(stack_info[, 1])
out <- lapply(names(out), function(x) {
  # Get the future names: columns 1 to 7 of the stack_info row for this source
  new_cnames <- unname(unlist(stack_info[stack_info$SNAME == x, 1:7]))
  # Replace the column names; columns 9 and 10 keep their existing names
  colnames(out[[x]]) <- c("Source", new_cnames, colnames(out[[x]])[9:10])
  # Return the modified version without the first column
  out[[x]][, -1]
})
# Write each table in turn, appending so the tables end up stacked with their
# own header rows (change "" to the file name you wish and sep to your desired
# separator, and see ?write.table for more documentation).
# lapply() is wrapped in invisible() because write.table() is called only for
# its side effect; its NULL return values are not worth printing.
invisible(lapply(out, write.table,
  append = TRUE, file = "", row.names = FALSE, sep = "|"
))
The main idea is to loop over the list of data frames to change their colnames. I update the list and then loop again to write; you may want to append to the file in the first loop instead.
I hope the comments are enough to get the code, tell me if it needs some details.
Output on screen (omitting warnings):
"Stack 1"|"1"|"1.1"|"0"|"999"|"999.1"|"2"|"Nox"|"Sox"
2010|1|0|26.2|341|1.5|0|39|15.5
2010|1|1|26.2|341|1.5|0|39|15.5
2010|1|2|26.2|341|1.5|0|39|15.5
2010|1|3|26.2|341|1.5|0|39|15.5
"Stack 2"|"4"|"4.1"|"2"|"999"|"999.1"|"2.1"|"Nox"|"Sox"
2010|1|0|22.4|328|2.5|15|33.3|17.9
2010|1|1|22.4|328|2.5|15|33.3|17.9
2010|1|2|22.4|328|2.5|15|33.3|17.9
2010|1|3|22.4|328|2.5|15|33.3|17.9
2010|1|4|22.4|328|2.5|15|33.3|17.9
Related
I've created this function below that produces data that will go in a report in a UI.
However, it's not necessarily doing what I would like it to with the name and age values. It prints out the name and age once for each order. So if, e.g., Customer ID 59 made 2 orders, her name will be printed out as "Jane" "Jane" - I would like it to not do that.
If anyone has any idea on how to change this, i'd appreciate your input.
CustomerReport <- function(ID, Start_Date, End_Date) {
  # Summarise one customer's orders between Start_Date and End_Date (inclusive).
  #
  # Reads the data frames OrdersData, ItemsInOrders and CustomersData from the
  # enclosing environment and returns a named list with the customer's name and
  # age, the number of orders, the mean and median item quantity, and two
  # proportion tables (order type and product).

  # Orders placed by this customer inside the date window
  in_window <- OrdersData$Date >= Start_Date & OrdersData$Date <= End_Date
  orders <- OrdersData[OrdersData$Customer_ID == ID & in_window, ]

  # Line items that belong to those orders
  items <- ItemsInOrders[ItemsInOrders$Order_ID %in% orders$Order_ID, ]

  # Row(s) of CustomersData for this customer
  customer <- CustomersData[CustomersData$Customer_ID == ID, ]

  order_count <- nrow(orders)

  list(
    Name = paste(customer$First_Name, customer$Last_Name),
    Age = customer$Customer_Age,
    NumberofOrders = order_count,
    MeanTotals = mean(items$Quantities),
    MedianTotals = median(items$Quantities),
    # Share of this customer's orders per order type
    PercentageType = table(orders$Type) / order_count,
    # Product counts for this customer relative to all rows of ItemsInOrders
    PercentageBreakdown = table(items$Products) / nrow(ItemsInOrders)
  )
}
#Test the Customer Report Function
CustomerReport(1251, "2019-01-01", "2019-01-25")
the dput for the data frames
dput(droplevels(CustomersData[1:5, ]))
structure(list(First_Name = c("Ariel", "Kinshasa", "May", "Gabrielle",
"Jennifer"), Last_Name = c("Dirrim", "Purifoy", "Sue", "Finley",
"Towns"), Customer_ID = c(1251L, 290L, 1714L, 381L, 109L), Customer_DOB = structure(c(11181,
3956, 10632, 9742, 11145), class = "Date"), Customer_Age = c(20,
39, 21, 24, 20)), row.names = c(NA, 5L), class = "data.frame")
dput(droplevels(OrdersData[1:5, ]))
structure(list(Order_ID = c(69L, 3025L, 3549L, 27L, 4561L), Customer_ID = c(1251L,
290L, 1714L, 381L, 109L), Date = structure(c(17899, 17921, 17925,
17923, 17917), class = "Date"), Type = structure(c(2L, 1L, 2L,
2L, 2L), .Label = c("Delivery", "Pick Up"), class = "factor"),
Coupon = c("OFF10", NA, "LARGE10", "LARGE10", "LARGE10"),
Delivery_Fee = c("0", "12", "0", "0", "0"), Sub_Total_Before_Discount = c(27.98,
40.9, 74.94, 91.85, 80.82), Discount = c(2.8, 0, 7.49, 9.19,
8.08), Sub_Total_After_Discount = c(25.18, 40.9, 67.45, 82.66,
72.74), GST = c(2.52, 4.09, 6.74, 8.27, 7.27), Total = c(27.7,
44.99, 74.19, 90.93, 80.01)), row.names = c(NA, 5L), class = "data.frame")
dput(droplevels(ItemsInOrders[1:5, ]))
structure(list(Order_ID = c(69L, 3025L, 3025L, 3549L, 3549L),
Products = structure(c(2L, 4L, 1L, 3L, 5L), .Label = c("BBQ Chicken Pizza",
"Meatlovers Pizza", "Seafood Pizza", "Supreme Pizza", "Vegetarian Pizza"
), class = "factor"), Prices = c(13.99, 13.95, 14.95, 13.99,
10.99), Quantities = c(2L, 1L, 1L, 3L, 3L)), row.names = c(NA,
-5L), class = c("tbl_df", "tbl", "data.frame"))
>
Everything else is perfect. Just the Names and the age are coming out in duplicates or triplicates.
Also, while we are here - is it possible to return the Percentage breakdowns as actual % values rather than 0.1 etc?
I guess you are looking for unique. However, the behaviour you are describing isn't reproducible with the data you provided.
Try to replace the two lines getting the name and age information in your CustomerReport function:
Name <- paste(unique(CustomerInfo$First_Name), unique(CustomerInfo$Last_Name))
Age <- unique(CustomerInfo$Customer_Age)
I am trying to average reps of data, subset one treatment, then make a bar graph of the response and another factor. My plot ends up not working. Any help would be much appreciated.
My data:
data <- structure(list(Sample = c(1011L, 1012L, 1014L, 1024L, 1025L,
1026L), Collection = structure(c(1L, 1L, 1L, 1L, 1L, 1L), .Label = c("1",
"2"), class = "factor"), Irrigation = structure(c(3L, 3L, 3L,
5L, 5L, 5L), .Label = c("Rate1", "Rate2", "Rate3", "Rate4", "Rate5"
), class = "factor"), Variety = structure(c(2L, 1L, 3L, 3L, 2L,
1L), .Label = c("Hodag", "Lamoka", "Snowden"), class = "factor"),
Suc = c(0.7333, 0.4717, 0.5883, 0.6783, 0.8283, 0.6833),
Gluc = c(0.03, 0.04, 0.043, 0.075, 0.057, 0.087), L = c(59.48,
57.59, 59.25, 66.45, 68.29, 65.65), a = c(4.36, 6.85, 3.43,
1.7, 0.78, 2.84), b = c(26.82, 27.6, 26.2, 26.14, 25.37,
27.19), NoDefect = c(100L, 100L, 100L, 92L, 100L, 100L),
Defect = c(0L, 0L, 0L, 8L, 0L, 0L)), row.names = c(NA, 6L
), class = "data.frame")
Averaging between reps:
dataAvgSuc <- data %>%
dplyr::group_by(Collection, Irrigation, Variety) %>%
dplyr::summarise(meanSuc=mean(Suc))
Made 'Collection' a factor:
dataAvgSuc$Collection <- as.factor(dataAvgSuc$Collection)
Subset by variety:
subLamoka <- subset(dataAvgSuc, Variety=="Lamoka")
subHodag <- subset(dataAvgSuc, Variety=="Hodag")
subSnowden <- subset(dataAvgSuc, Variety=="Snowden")
Attempted ggplot:
sucPlot <-ggplot(data=subLamoka, aes(x=dataAvgSuc$Collection,
y=meanSuc)) + geom_bar(stat="identity")
Error code:
Error: Aesthetics must be either length 1 or the same as the data (10):
x, y
However, both the x and y have 30 entries when I look at them.
Trev,
Had some trouble re-generating the issue as the sample data provided are for just 6 observations, not 30. So not sure if the below solution would work for you or not.
I used the code you supplied to create the dataframe:
data <- structure(list(Sample = c(1011L, 1012L, 1014L, 1024L, 1025L, 1026L),
Collection = structure(c(1L, 1L, 1L, 1L, 1L, 1L), .Label = c("1",
"2"), class = "factor"),
Irrigation = structure(c(3L, 3L, 3L,5L, 5L, 5L), .Label = c("Rate1", "Rate2",
"Rate3", "Rate4", "Rate5"
), class = "factor"), Variety = structure(c(2L, 1L, 3L, 3L, 2L,
1L), .Label = c("Hodag", "Lamoka", "Snowden"), class = "factor"),
Suc = c(0.7333, 0.4717, 0.5883, 0.6783, 0.8283, 0.6833),
Gluc = c(0.03, 0.04, 0.043, 0.075, 0.057, 0.087),
L = c(59.48, 57.59, 59.25, 66.45, 68.29, 65.65),
a = c(4.36, 6.85, 3.43, 1.7, 0.78, 2.84),
b = c(26.82, 27.6, 26.2, 26.14, 25.37,27.19),
NoDefect = c(100L, 100L, 100L, 92L, 100L, 100L),
Defect = c(0L, 0L, 0L, 8L, 0L, 0L)),
row.names = c(NA, 6L), class = "data.frame")
data$Collection
However, your Collection factor is defined with two levels but only one is shown in the example. Perhaps this could be why the averages were coming out greater than 1? I modified the code below to have 2 levels of collection represented in the data.
data2 <- structure(list(Sample = c(1011L, 1012L, 1014L, 1024L, 1025L, 1026L),
Collection = structure(c(1L, 1L, 1L, 2L, 2L, 2L), .Label = c("1",
"2"), class = "factor"),
Irrigation = structure(c(3L, 3L, 3L,5L, 5L, 5L), .Label = c("Rate1", "Rate2",
"Rate3", "Rate4", "Rate5"
), class = "factor"), Variety = structure(c(2L, 1L, 3L, 3L, 2L,
1L), .Label = c("Hodag", "Lamoka", "Snowden"), class = "factor"),
Suc = c(0.7333, 0.4717, 0.5883, 0.6783, 0.8283, 0.6833),
Gluc = c(0.03, 0.04, 0.043, 0.075, 0.057, 0.087),
L = c(59.48, 57.59, 59.25, 66.45, 68.29, 65.65),
a = c(4.36, 6.85, 3.43, 1.7, 0.78, 2.84),
b = c(26.82, 27.6, 26.2, 26.14, 25.37,27.19),
NoDefect = c(100L, 100L, 100L, 92L, 100L, 100L),
Defect = c(0L, 0L, 0L, 8L, 0L, 0L)),
row.names = c(NA, 6L), class = "data.frame")
data2$Collection
Since you're using dplyr, just keep piping that object into ggplot -- I don't think you need to create subsets as new data frames; you can instead graph them all separately with a facet_wrap command. I am also using geom_col instead of geom_bar, as the latter is generally meant for graphing count data. Since you want to plot an average, geom_col may be better. Also, since the example below pipes the data into ggplot, the "data=" definition typically used in ggplot commands is not needed.
First with data:
# Average Suc within each Collection/Irrigation/Variety combination and pipe
# the summary straight into ggplot (so no data= argument is given)
data %>%
dplyr::group_by(Collection,Irrigation, Variety) %>%
dplyr::summarise(meanSuc=mean(Suc)) %>%
# Collection on the x axis, the averaged Suc on the y axis
ggplot(aes(x = Collection, y = meanSuc)) +
geom_col() +
# one panel per Variety, instead of a separate subset per variety
facet_wrap(.~Variety)
Incorporate Irrigation:
# Same summary as before: mean Suc per Collection/Irrigation/Variety
data %>%
dplyr::group_by(Collection,Irrigation, Variety) %>%
dplyr::summarise(meanSuc=mean(Suc)) %>%
# fill = Irrigation colours the bars by irrigation rate
ggplot(aes(x = Collection, y = meanSuc, fill = Irrigation)) +
geom_col() +
# one panel per Variety
facet_wrap(.~Variety)
And using data2 instead, as defined above, will produce the Collection levels 1 and 2 side by side on the graph. With this method I was able to generate a result and all averages were less than 1, between .4~.8
I have a dataset called dietox which has missing values (NA) for the Feed variable. I need to use conditional selection to create a subset of the data for which the rows with missing values are deleted.
The code I tried was:
dietox[!is.NA[dietox$Feed, ]
... but am not sure if that is right to create a subset.
dput(head(dietox))
dietox <- structure(list(Weight = c(26.5, 27.59999, 36.5, 40.29999, 49.09998,
55.39999), Feed = c(NA, 5.200005, 17.6, 28.5, 45.200001, 56.900002 ),
Time = 1:6, Pig = c(4601L, 4601L, 4601L, 4601L, 4601L, 4601L ),
Evit = c(1L, 1L, 1L, 1L, 1L, 1L), Cu = c(1L, 1L, 1L, 1L, 1L, 1L),
Litter = c(1L, 1L, 1L, 1L, 1L, 1L)),
.Names = c("Weight", "Feed", "Time", "Pig", "Evit", "Cu", "Litter"),
row.names = c(NA, 6L), class = "data.frame")
You have the right idea, but is.na is a function and so needs to be called with parentheses.
dietox[!is.na(dietox$Feed), ]
Sample data:
full<-structure(list(Location = structure(c(1L, 1L, 1L, 1L, 1L, 1L), .Label = c("AKS",
"AOK", "BTX", "GTX", "HKS", "JKS", "LOK", "MKS", "MOK", "PKS",
"SKS", "VTX"), class = "factor"), CT_NT = structure(c(1L, 1L,
1L, 1L, 1L, 1L), .Label = c("CT", "NT"), class = "factor"), Depth = c(5L,
10L, 15L, 5L, 10L, 15L), Site = c(1L, 1L, 1L, 1L, 1L, 1L), PW = c(22.8,
21.5, 18.2, 22.5, 20.5, 19.2), BD = c(1.1, 1.2, 1.1, 1.3, 1.3,
1.5)), .Names = c("Location", "CT_NT", "Depth", "Site", "PW",
"BD"), row.names = c(NA, 6L), class = "data.frame")
osu<-structure(list(Location = structure(c(1L, 1L, 1L, 1L, 1L, 1L), .Label = c("AKS",
"AOK", "BTX", "GTX", "HKS", "JKS", "LOK", "MKS", "MOK", "PKS",
"SKS", "VTX"), class = "factor"), CT_NT = structure(c(1L, 1L,
1L, 2L, 2L, 2L), .Label = c("CT", "NT"), class = "factor"), Depth = c(5L,
10L, 15L, 5L, 10L, 15L), pH = c(5.1, 5.4, 5.9, 5.2, 5.9, 6.2),
N = c(50, 31, 22, 35, 17, 8), P = c(122, 55, 34, 107, 23,
17), K = c(1301, 1202, 1078, 1196, 1028, 948), OM = c(2.3,
1.8, 1.5, 2.1, 1.4, 1.2), NH4 = c(19.3, 14.5, 11.6, 12.3,
8.6, 8.4), Sand = c(22.5, 25, 25, 25, 22.5, 18.8), Silt = c(56.3,
52.5, 50, 51.3, 52.5, 51.3), Clay = c(21.3, 22.5, 25, 23.8,
25, 30)), .Names = c("Location", "CT_NT", "Depth", "pH",
"N", "P", "K", "OM", "NH4", "Sand", "Silt", "Clay"), row.names = c(NA,
6L), class = "data.frame")
I am trying to join two datasets using left_join in dplyr. To my astonishment, I'm getting duplicate rows that are somehow not being identified as such. After reading all the other answers I could get my hands on here that seemed to address "join" issues (at least I'm not the only one who has them...?), I have tried:
Checking the group types of the joining variables in the two datasets
to ensure they match
Checking that I don't have duplicates within f1 or f2
Checking that the categorical columns I'm using to join are, in fact,
the same length and have the same contents. They're EXACTLY the same,
all the way down to the order I put them in
Explicitly specifying to dplyr to use Location, CT_NT, and Depth to
join
Letting dplyr figure out the joining variables itself
Joining in both orders
Using inner_join--I ended up with f1 only
I've used left_join before and not had this issue, and it was with a very similar dataset (the pilot data to this full study, in fact). I thought I understood what left_join was doing, but now I'm wondering if I don't actually. I'm trying to get better with using dplyr, but unfortunately it's a lot of me bashing away at things until something works and I can figure out why it worked so I can reproduce it again later as needed.
Given my inexperience, I'm sure the answer is going to be frustratingly straightforward and simple, to the annoyance of everyone involved. Such is the life of learning to code, I guess. Thank you in advance for dealing with a rookie's doofy questions!
Here's my code:
# Build f1: means of the measurement columns of `full` per
# Location/CT_NT/Depth/Site group.
f1<-full %>% #Build f1. Pipe full to...
group_by(Location,CT_NT,Depth,Site) %>% #group_by to work on CT or NT at each site
# NOTE(review): funs() is deprecated in recent dplyr releases, and 5:6 selects
# columns by position -- presumably PW and BD; confirm against the installed
# dplyr version.
summarise_at(5:6,funs(mean)) %>% #calculate site means
# NOTE(review): f1 is passed to ungroup() inside the very pipeline that creates
# f1; the piped data is already the first argument. Confirm whether a bare
# ungroup() was intended.
ungroup(f1)
f1$Depth<-as.factor(f1$Depth) #Depth becomes a factor in both tables before joining
f1$Site<-NULL #drop Site so it does not take part in the join
osu$Texture_Class<-NULL#Take out the texture class column
# Build f2: osu sorted by the same three key columns.
f2<- osu %>%
group_by(Location,CT_NT,Depth) %>% #group because otherwise R tries to crash on the next line of code...
arrange(Location,CT_NT,Depth) %>% #Put everything in order like f1, just in case
# NOTE(review): same pattern as above -- f2 is referenced while it is being
# defined.
ungroup(f2)
f2$Depth<-as.factor(f2$Depth)
# No `by` is given, so the join uses whichever column names f1 and f2 share --
# presumably Location, CT_NT and Depth; verify from the join message.
full_summary<-left_join(f1,f2)
I'm completely new to R - really have no clue what I'm doing to be honest. But I really need to run bivariate/multivariate regressions with this data following someone's advice and I'm stuck. Any help is greatly appreciated.
rm(list=ls())
setwd("C:/Users/Bogi/Documents/School/Honors Thesis/Voting and Economic Data")
data<-read.csv("BOGDAN_DATA1.csv")
head(data)
round(cor(data[,-1],use="complete.obs"),1)
Error in cor(data[, -1], use = "complete.obs") : 'x' must be numeric
dput
structure(list(REGION = structure(1:6, .Label = c("Altai Republic",
"Altai Territory", "Amur Region", "Arkhangelsk Region", "Astrakhan region",
"Belgorod region"), class = "factor"), PCT_CHANGE_VOTE = structure(c(2L,
3L, 5L, 4L, 6L, 1L), .Label = c("-13%", "-16%", "-17%", "-25%",
"-26%", "2%"), class = "factor"), PCT_CHANGE_GRP = structure(c(2L,
1L, 4L, 3L, 3L, 4L), .Label = c("10%", "17%", "19%", "27%"), class = "factor"),
PCT_CHANGE_INFLATION = structure(c(1L, 2L, 1L, 3L, 3L, 2L
), .Label = c("-2%", "-3%", "-4%"), class = "factor"), PCT_CHANGE_UNEMP = structure(c(5L,
4L, 1L, 2L, 6L, 3L), .Label = c("-13%", "-14%", "-17%", "-3%",
"5%", "7%"), class = "factor"), POVERTY = c(18.6, 22.6, 20.4,
14.4, 14.2, 8.6), POP_AGE1 = c(25.8, 16.9, 18.5, 17.1, 17.8,
15.2), POP_AGE2 = c(58.8, 59.6, 61.3, 60.4, 60.8, 60.3),
POP_AGE3 = c(15.4, 23.5, 20.2, 22.5, 21.4, 24.5), POP_URBAN = c(28.7,
55.2, 67, 76.2, 66.7, 66.4), POP_RURAL = c(71.3, 44.8, 33,
23.8, 33.3, 33.6), COMPUTER = c(46.4, 54.5, 66.1, 74, 65.1,
55.2), INTERNET = c(32.1, 41, 50.7, 66.5, 60, 50.7)), .Names = c("REGION",
"PCT_CHANGE_VOTE", "PCT_CHANGE_GRP", "PCT_CHANGE_INFLATION",
"PCT_CHANGE_UNEMP", "POVERTY", "POP_AGE1", "POP_AGE2", "POP_AGE3",
"POP_URBAN", "POP_RURAL", "COMPUTER", "INTERNET"), row.names = c(NA,
6L), class = "data.frame")
You could loop the columns 2:5 (lapply(data[2:5], ..)), remove the % in columns 2:5 (gsub('[%]',..)) and convert the columns to numeric. The output from gsub will be character class, convert it to numeric by as.numeric
# Strip the percent sign from columns 2 to 5 and convert them to numeric
data[2:5] <- lapply(data[2:5], function(x)
as.numeric(gsub('[%]', '', x)))
# Correlation matrix of every column except the first, rounded to 1 decimal
# place and computed on complete observations only
Cor1 <- round(cor(data[-1],use="complete.obs"),1)
Or you could remove the % in those columns using awk on shell (assuming ,
as delimiter)
awk 'BEGIN {OFS=FS=","} function SUB(F) {sub(/\%/,"", $F)}{SUB(2);SUB(3);SUB(4);SUB(5)}1' Bogdan.csv > Bogdan2.csv
Read the file with read.csv and run the cor
dat1 <- read.csv('Bogdan2.csv')
Cor2 <- round(cor(dat1[-1], use='complete.obs'), 1)
identical(Cor1, Cor2)
#[1] TRUE