Warning Messages when running linear regression in R - r

I'm attempting to run a linear regression in R, but I get the following warnings:
Warning messages:
1: In model.response(mf, "numeric") :
using type = "numeric" with a factor response will be ignored
2: In Ops.factor(y, z$residuals) : ‘-’ not meaningful for factors
The code is:
reg_ex1 <- lm(V45~TotalScore,data = Combineddatainprogresscsv)
Both values, V45, and TotalScore are numerical. A Google search yielded a similar question where it was suggested that the csv file might have commas. But I'm not an expert so don't know how to check this?
Thank you!
There are 1300 lines, so here is just the final part of the output. Let me know if you need more.
"50", "60", "70", "80", "90", "Compared to others who may have taken this test, how well do you think you scored? - 1"
), class = "factor"), V46 = structure(c(23L, 6L, 4L, 22L,
4L, 8L), .Label = c("", "0", "1", "10", "11", "12", "13",
"14", "15", "16", "17", "18", "19", "2", "20", "3", "4",
"5", "6", "7", "8", "9", "Score"), class = "factor"), TotalScore = c(0L,
12L, 10L, 9L, 10L, 14L)), row.names = c(NA, 6L), class = "data.frame")

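It seems your response variable is a factor. The warnings say so, and in the output you pasted the column just before V46 (presumably V45, the response in your model) ends with class = "factor". V46 is stored the same way: V46 = structure(c(23L, 6L, 4L, 22L,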
4L, 8L), .Label = c("", "0", "1", "10", "11", "12", "13",
"14", "15", "16", "17", "18", "19", "2", "20", "3", "4",
"5", "6", "7", "8", "9", "Score"), class = "factor")
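I would suggest converting the response (V45 in your model; the same applies to V46) to character and then to numeric, e.g. as.numeric(as.character(x)), and finally filtering out the missing values that will be produced by non-numeric levels such as "Score" and "".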
You should definitely listen to the people in the comments so it's easier to help you :)

Related

Axis titles are disappearing in ggplot2 when I try to manually set months

I'm trying to use months as the x-axis for a ggplot graph, but I have multiple times in each day, and I need the times to be in chronological order. I've tried manually setting the breaks and labels to month numbers and names, but now the x-axis is just blank. What should I do?
month_nums <- seq(1, 12)
month_names <- month.abb
time_strs <- paste(moods$Month, moods$Day, moods$Year, moods$Hour, moods$Minute, sep=":")
timestamps <- as.POSIXct(time_strs, format = "%m:%d:%Y:%H:%M", tz="EST")
ggplot(data2, aes(x=timestamps)) +
geom_smooth(aes(y=anxious2, color="Anxiety"), size=1.2) +
geom_smooth(aes(y=happy2, color="Happiness"), size=1.2) +
geom_smooth(aes(y=social2, color="Sociability"), size=1.2) +
scale_color_manual(name="Variables",
values=c("red",
"grey20",
"grey40")) +
scale_x_continuous(name="Timestamps", breaks=month_nums, labels=month_names) +
scale_y_continuous(name="Intensity") +
ggtitle("Emotions")
# moods data
structure(list(Year = c("2021", "2021", "2021", "2021", "2021", "2021", "2021", "2021", "2021", "2021"),
Month = c("9", "9", "9", "9", "9", "10", "10", "10", "10", "10"),
Day = c("29", "29", "30", "30", "30", "1", "1", "1", "1", "2"),
Hour = c("16", "21", "7", "12", "16", "8", "12", "19", "22", "13"),
Minute = c("24", "52", "58", "53", "18", "42", "24", "49", "18", "27"),
happy = c("3", "4", "5", "3", "4", "5", "2", "3", "1", "1"),
social = c("6", "5", "8", "8", "4", "10", "3", "2", "2", "2"),
anxiety = c("-", "-", "1", "2", "1", "1", "2", "1", "1", "1")),
row.names = 2:11, class = "data.frame")
As timestamps is a datetime/POSIXct, you'll find scale_x_datetime much handier than scale_x_continuous. You can tweak the arguments to show what you prefer.
# timestamps is POSIXct, so the datetime scale is required
# (scale_x_date() only accepts Date objects)
scale_x_datetime(date_breaks = "1 month", date_labels = "%b")
Output:
Data:
I needed to change the variable names of the provided data a little, add some small missing parts, and extend the data to cover more than two months in order to have some output to show here.
moods <- structure(list(Year = c("2021", "2021", "2021", "2021", "2021", "2021", "2021", "2021", "2021", "2021"),
Month = c("9", "7", "8", "9", "9", "10", "10", "11", "12", "12"),
Day = c("29", "29", "30", "30", "30", "1", "1", "1", "1", "2"),
Hour = c("16", "21", "7", "12", "16", "8", "12", "19", "22", "13"),
Minute = c("24", "52", "58", "53", "18", "42", "24", "49", "18", "27"),
happy = c("3", "4", "5", "3", "4", "5", "2", "3", "1", "1"),
social = c("6", "5", "8", "8", "4", "10", "3", "2", "2", "2"),
anxiety = c(NA, NA, "1", "2", "1", "1", "2", "1", "1", "1")),
row.names = 2:11, class = "data.frame") |> mutate(across(c(happy:anxiety), as.numeric))
data2 <- moods
data2$time_strs <- paste(moods$Month, moods$Day, moods$Year, moods$Hour, moods$Minute, sep=":")
data2$timestamps <- as.POSIXct(data2$time_strs, format = "%m:%d:%Y:%H:%M", tz="EST")
ggplot(data2, aes(x=timestamps)) +
geom_smooth(aes(y=anxiety, color="Anxiety"), size=1.2) +
geom_smooth(aes(y=happy, color="Happiness"), size=1.2) +
geom_smooth(aes(y=social, color="Sociability"), size=1.2) +
scale_color_manual(name="Variables",
values=c("red",
"grey20",
"grey40")) +
scale_x_datetime(date_breaks = "1 month", date_labels = "%b") +
scale_y_continuous(name="Intensity") +
ggtitle("Emotions")

"Error in rep(1, n) : invalid 'times' argument" when trying to run lars regression in R

I am trying to run a regression using lars in R. However, I keep getting this error :
Error in rep(1, n) : invalid 'times' argument.
Here's my code :
IN: dput(head(LBJ09))
OUT: structure(list(G = c("1", "2", "3", "4", "5", "6"), Date = c("2008-10-28",
"2008-10-30", "2008-11-01", "2008-11-03", "2008-11-05", "2008-11-07"
), Age = c("23-303", "23-305", "23-307", "23-309", "23-311",
"23-313"), Tm = c("CLE", "CLE", "CLE", "CLE", "CLE", "CLE"),
Home = c("#", "", "#", "#", "", ""), Opp = c("BOS", "CHA",
"NOH", "DAL", "CHI", "IND"), GS = c("1", "1", "1", "1", "1",
"1"), MP = c("36:00", "30:16", "37:43", "34:08", "35:50",
"39:20"), FG = c("9", "7", "6", "8", "13", "11"), FGA = c("21",
"15", "15", "20", "23", "24"), FGP = c(".429", ".467", ".400",
".400", ".565", ".458"), `3PM` = c("0", "0", "0", "0", "0",
"1"), `3PA` = c("4", "2", "3", "2", "2", "6"), `3PP` = c(".000",
".000", ".000", ".000", ".000", ".167"), FT = c("4", "8",
"3", "13", "15", "4"), FTA = c("8", "12", "4", "15", "16",
"7"), FTP = c(".500", ".667", ".750", ".867", ".938", ".571"
), ORB = c("1", "2", "2", "2", "2", "2"), DRB = c("6", "7",
"5", "6", "7", "7"), TRB = c("7", "9", "7", "8", "9", "9"
), AST = c("6", "9", "13", "3", "6", "8"), STL = c("2", "0",
"3", "2", "4", "1"), BLK = c("1", "1", "0", "1", "0", "4"
), TOV = c("3", "5", "5", "1", "5", "4"), PF = c("4", "3",
"3", "1", "1", "2"), PTS = c("22", "22", "15", "29", "41",
"27"), GmSc = c("14.1", "17.0", "15.3", "24.0", "36.0", "21.5"
)), row.names = c(NA, 6L), class = "data.frame")
library(lars)
lars09 <- lars(LBJ09$PTS, LBJ09$FG+LBJ09$AST+LBJ09$`3PM`+LBJ09$FT+LBJ09$TRB+
LBJ09$STL+LBJ09$BLK, type = "lasso")
plot(lars09)
I expect the lars package to run successfully since all LBJ variables are numerical data in columns. However, lars does not even run. any ideas ?
Try this code:
# Convert the model variables from character to numeric.
# lapply() always returns one list element per column, so the column-wise
# assignment keeps its shape whatever the number of rows (sapply() does not
# guarantee this).
vars <- c("PTS", "FG", "AST", "3PM", "FT", "TRB", "STL", "BLK")
LBJ09[vars] <- lapply(LBJ09[vars], as.numeric)
# Create the X matrix of explanatory variables (everything except PTS)
# and the y vector with the outcome (PTS)
X <- as.matrix(LBJ09[vars[-1]])
y <- LBJ09[[vars[1]]]
# lars() expects the predictor matrix as x and the response as y
lars09 <- lars(x = X, y = y, type = "lasso")
plot(lars09)

CSV Import issues in R

After importing a csv file, R separated my data into columns every comma it reads.
My issue is that I originally had two columns: one holding expressions made of floating-point numbers, and the other holding the sum of all of those numbers. R spread these values over 5 or 6 columns, sometimes fewer and sometimes more, depending on how many commas each row contains.
One thing makes this easier: the first column is delimited by parentheses. For example, the first column of the first row is (-5,5+9)+(-10+12), and the second column would be the sum of these floating-point numbers. So I can easily see where the first column stops. After the second column (that is, the sum of the elements of the first column) there are at least two empty columns, so I can easily recognize where the second column ends. What I have to do now is restore my dataset to its original form. I post the structure of the dataset below to make this easier to understand.
here's is the code of the first rows
Y= structure(list(V24 = structure(c(66L, 15L, 44L, 28L, 68L, 10L
), .Label = c("", "(-0", "(-0+7", "(-1", "(-1+11", "(-1+11)+(-13",
"(-1+11)+(-18+18", "(-1+3)+(-10+14)", "(-1+8)", "(-2", "(-2+10",
"(-2+10)", "(-2+10)+(-13", "(-2+11", "(-2+11)", "(-2+11)+(-13",
"(-2+11)+(-14+17)", "(-2+12", "(-2+12)", "(-2+12)+(-14", "(-2+12)+(-14+15",
"(-2+12)+(-14+16)", "(-2+6)+(-8+10)+(-14", "(-2+7", "(-2+7)",
"(-2+7)+(-11", "(-2+7)+(-13", "(-2+8", "(-2+8)+(-10", "(-2+8)+(-11",
"(-2+8)+(-13", "(-2+8)+(-15", "(-2+9", "(-2+9)", "(-2+9)+(-13",
"(-2+9)+(-14", "(-3", "(-3+10", "(-3+10)", "(-3+10)+(-13", "(-3+10)+(-13+14",
"(-3+10)+(-14+14", "(-3+11", "(-3+11)", "(-3+11)+(-13", "(-3+12",
"(-3+12)", "(-3+12)+(-13", "(-3+13)", "(-3+7", "(-3+8", "(-3+8)",
"(-3+8)+(-11+12", "(-3+9", "(-3+9)", "(-4", "(-4+10", "(-4+10)",
"(-4+10)+(-11+12)", "(-4+11", "(-4+11)", "(-4+12", "(-4+12)",
"(-4+13)", "(-4+14)", "(-4+6)+(-9", "(-4+8", "(-4+8)+(-10+14)",
"(-4+9", "(-4+9)+(-10+11)+(-13", "(-4+9)+(-12+13)+(-18+18", "(-4+9)+(-13+14",
"(-4+9)+(-14+15)", "(-4+9)+(-9", "(-5", "(-5+10", "(-5+10)",
"(-5+10)+(-13", "(-5+11)", "(-5+12)", "(-5+13)+(-14", "(-6",
"(1+6)+(-8+9", "S"), class = "factor"), V25 = structure(c(7L,
67L, 66L, 58L, 66L, 54L), .Label = c("", "(-4+11", "(-5", "10",
"12", "25)+(-14+15)", "25+12", "25+14", "3)", "3+6)", "3+7",
"5", "5)", "5)+(-10", "5)+(-11", "5)+(-11+13)+(-14", "5)+(-13",
"5)+(-13+13", "5)+(-13+14", "5)+(-14", "5)+(-16", "5)+(-16+16",
"5)+(-16+17)+(-21+22", "5)+(-17", "5)+(-18+18", "5+10", "5+10)",
"5+10)+(-13", "5+11", "5+11)", "5+11)+(-13", "5+11)+(-17+17",
"5+11)+(-21+21", "5+12", "5+12)", "5+12)+(-13", "5+12)+(-20+20",
"5+13", "5+13-13", "5+14", "5+15", "5+16", "5+16)", "5+18)",
"5+6)+(-14+14", "5+7", "5+7)+(-13", "5+7)+(-15", "5+7)+(-9+12",
"5+8", "5+8)", "5+8)+(-17", "5+9", "5+9)", "5+9)+(-13", "5+9)+(-14",
"5+9)+(-22", "50)", "50+10)+(-14", "50+14", "50+7", "6", "7",
"75)", "75)+(-14+15", "8", "9", "T"), class = "factor"), V26 = structure(c(31L,
1L, 1L, 29L, 1L, 29L), .Label = c("", "10", "11", "25)", "25)+(-14+15",
"25+15", "4", "5", "5)", "5)+(-13", "5)+(-14", "5)+(-16", "5)+(-16+17)",
"5)+(-20+21)", "5+10)+(-13", "5+13", "5+14", "5+14)", "5+14)+(-18+18",
"5+15", "5+15)", "5+16)", "5+17", "5+18", "5+18)", "5+23", "50)",
"50+16", "6", "7", "75)", "75+14", "75+15", "8", "9"), class = "factor"),
V27 = structure(c(9L, 1L, 1L, 9L, 1L, 9L), .Label = c("",
"10", "11", "12", "25", "25)", "25+17", "3", "5", "5)", "5+14",
"5+15)", "5+15)+(-18", "50)", "6", "7", "75", "75)", "8",
"9"), class = "factor"), V28 = structure(c(9L, 12L, 15L,
1L, 8L, 1L), .Label = c("", "1", "10", "11", "2", "25)",
"3", "4", "5", "5)", "5+19", "6", "7", "75", "8", "9"), class = "factor"),
V29 = structure(c(1L, 5L, 10L, 1L, 6L, 1L), .Label = c("",
"25", "2prol", "30", "40", "41", "5", "5)", "50", "52", "75",
"8", "9"), class = "factor"), V30 = structure(c(1L, 6L, 12L,
5L, 7L, 13L), .Label = c("", "25", "3", "3conc", "4", "45",
"46", "5", "52", "56", "6", "60", "8", "9"), class = "factor"),
V31 = structure(c(15L, 7L, 10L, 3L, 8L, 7L), .Label = c("",
"35", "40", "43", "4mot", "5", "52", "53", "54", "55", "56",
"57", "60", "63", "7"), class = "factor"), V32 = c(43L, NA,
NA, 52L, NA, 57L), V33 = c(45L, NA, NA, 59L, NA, 56L), V34 = c(55L,
NA, NA, NA, NA, NA)), row.names = 3:8, class = "data.frame")
So my idea is:
read all the columns, identify and separate the first from the second column: the last element of the first column is highlighted by the closing parenthesis
Working on the first one: i would say "take the value of the next column and add to the previous column adding a comma before"
Working on the second one: Since the second column starts where there's the first element after the closing parenthesis, i would take the first value (in that case we have an integer) or if the following column has a number (so that the column is not empty) add the number of the following column to the first columns linked by a comma.
Note: i have the idea how to do it, but i can't translate these ideas into a code, how can i do this?

Trouble plotting dates with ggplot2

I'm trying to plot against dates in R. I've run into trouble trying to create vertical lines against a plot that I already have. All of the different formats that I try either result in nothing showing up on the plot, or a line at 1970 (the default date). The year-data is in the form yyyy-mm-dd. For example, "1914-07-01".
I've also tried inputting these dates in a data.frame, but got the same problem.
I've been trying to make a reproducible example, but I haven't seen any example datasets to do so with, and got frustrated trying to create one... sorry about that. Here's the relevant code:
ggplot(M,aes(x=date,color=origin,y=value)) +
geom_point() +
geom_line() +
facet_grid(topic~origin) +
geom_vline(xintercept=as.numeric(as.Date("1914-07-01")))
Everything plots correctly without the addition of the final line.
Edit: here's the result of dput(head(M)):
structure(list(topic = structure(c(1L, 1L, 1L, 1L, 1L, 1L), .Label = c("1",
"2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13",
"14", "15", "16", "17", "18", "19", "20", "21", "22", "23", "24",
"25"), class = "factor"), date = structure(c(-1767196800, -1765987200,
-1764518400, -1763308800, -1762099200, -1760889600), class = c("POSIXct",
"POSIXt"), tzone = ""), origin = structure(c(2L, 2L, 2L, 2L,
2L, 2L), .Label = c("Blast", "The_Egoist"), class = "factor"),
value = c(6.69960398194253e-07, 7.48757156068349e-07, 7.04834977806836e-07,
7.10226526475778e-07, 6.8295233938925e-07, 6.16466066169137e-07
)), .Names = c("topic", "date", "origin", "value"), class = c("grouped_df",
"tbl_df", "tbl", "data.frame"), row.names = c(NA, -6L), vars = list(
topic, date), drop = TRUE, indices = list(0L, 1L, 2L, 3L,
4L, 5L), group_sizes = c(1L, 1L, 1L, 1L, 1L, 1L), biggest_group_size = 1L, labels = structure(list(
topic = structure(c(1L, 1L, 1L, 1L, 1L, 1L), .Label = c("1",
"2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12",
"13", "14", "15", "16", "17", "18", "19", "20", "21", "22",
"23", "24", "25"), class = "factor"), date = structure(c(-1767196800,
-1765987200, -1764518400, -1763308800, -1762099200, -1760889600
), class = c("POSIXct", "POSIXt"), tzone = "")), class = "data.frame", row.names = c(NA,
-6L), .Names = c("topic", "date"), vars = list(topic, date)))
You were very close, the problem is your data is in POSIXct, and you were trying to convert to Date. To fix it, change to POSIXct:
ggplot(M,aes(x=date,color=origin,y=value)) +
geom_point() +
geom_line() +
facet_grid(topic~origin) +
geom_vline(xintercept=as.numeric(as.POSIXct("1914-07-01")))
You can see the difference in the calls:
as.numeric(as.POSIXct("1914-07-01"))
[1] -1751569200
as.numeric(as.Date("1914-07-01"))
[1] -20273
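This explains why the intercept was drawn so close to 1970 (the zero point of both scales): the axis is in seconds since 1970-01-01, so the Date value -20273 (a count of days) was read as 20273 seconds, i.e. less than six hours, before 1970.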

R - plot vertical profile

I have measurements of CH4 concentration with depth:
df <- structure(list(Depth = structure(c(1L, 2L, 3L, 4L, 5L, 6L, 7L,
8L, 9L, 10L, 11L, 12L, 15L, 16L, 17L), .Label = c("0", "10",
"12", "14", "16", "18", "2", "20", "22", "24", "26", "28", "30",
"32", "4", "6", "8", "AR"), class = "factor"), Conc_CH4 = c(4.30769230769231,
23.1846153846154, 14.5615384615385, 21.1769230769231, 16.2615384615385,
132.007692307692, 5.86923076923077, 389.353846153846, 823.023076923077,
948.684615384615, 1436.56923076923, 1939.88461538462, 26.2769230769231,
27.5538461538462, 19.6461538461538)), .Names = c("Depth", "Conc_CH4"
), row.names = c(NA, -15L), class = "data.frame")
And I need to create a plot like this:
But I have some problems: the factors in my data are in the wrong order, and I don't know how to plot this kind of data using ggplot2.
Any ideas?
Here's a solution with base plotting functions (you reverse the limits of ylim):
# Depth is a factor: convert via character to get the real depths rather than
# the internal level codes
df$Depth <- as.numeric(as.character(df$Depth))
# Sort by depth so the line is drawn from the surface downwards
df <- df[order(df$Depth), ]
# Reversed ylim puts depth 0 at the top. `type` is spelled out instead of
# relying on partial matching of `t`, and na.rm = TRUE keeps the limits finite
# if a non-numeric depth (e.g. "AR") was turned into NA above.
plot(Depth ~ Conc_CH4, df, type = "l",
     ylim = rev(range(df$Depth, na.rm = TRUE)))
Why not convert Depth to a number and plot?
# Convert the factor Depth to the numbers its labels represent
profile <- transform(df, Depth = as.numeric(as.character(Depth)))
# geom_line() would join the points in order of x (concentration); for a
# vertical profile they must be joined in order of depth, so sort the rows by
# Depth and use geom_path(), which follows the row order of the data.
ggplot(profile[order(profile$Depth), ], aes(x = Conc_CH4, y = Depth)) +
  geom_path() +
  scale_y_reverse()
The as.numeric(as.character(...)) is needed because your Depth is a factor: calling as.numeric directly on a factor returns the internal integer level codes, not the numbers shown in the labels, so the labels have to be converted to character first.
The scale_y_reverse reverses the y scale.
If your actual data has a depth of "AR" in it, you'll have to omit them or otherwise handle them.

Resources