Overwrite dataframe columns in R - r

I'm making a dataframe in R.
With this code I'm currently making a dataframe with 1 row.
logs_data_frame <- data.frame(
session_id = 1,
t_minus = NA,
glucose_reading = NA,
activity_type = "test",
duration = "test2",
intensity = "test3",
activity_start = "test4",
bolus_taken = NA,
bolus_3hr = NA,
previous_flow = NA,
new_flow_number = NA,
action_bolus = NA,
action_basaal = NA,
action_carbs = NA,
recommendation_bolus = NA,
recommendation_basaal = NA,
recommendation_carbs = NA,
bolus_1hr = NA,
date = Sys.Date(),
pid = 2
)
Now I'm wondering how to overwrite some values, such as t_minus and glucose_reading, so that instead of NA they contain "test5" and "test6", without adding another row.
Kind regards and thanks for the help!

Related

Conditionally replace cells in data frame based on another data frame

In the interest of learning better coding practices, can anyone show me a more efficient way of solving my problem? Maybe one that doesn't require new columns...
Problem: I have two data frames: one is my main data table (t) and the other contains changes I need to replace in the main table (Manual_changes). Example: Sometimes the CaseID is matched with the wrong EmployeeID in the file.
I can't provide the main data table, but the Manual_changes file looks like this:
Manual_changes = structure(list(`Case ID` = c(46605, 25321, 61790, 43047, 12157,
16173, 94764, 38700, 41798, 56198, 79467, 61907, 89057, 34232,
100189), `Employee ID` = c(NA, NA, NA, NA, NA, NA, NA, NA, 906572,
164978, 145724, 874472, 654830, 846333, 256403), `Age in Days` = c(3,
3, 3, 12, 0, 0, 5, 0, NA, NA, NA, NA, NA, NA, NA)), row.names = c(NA,
-15L), class = c("tbl_df", "tbl", "data.frame"))
temp = merge(t, Manual_changes, by = "Case ID", all.x = TRUE)
temp$`Employee ID.y` = ifelse(is.na(temp$`Employee ID.y`), temp$`Employee ID.x`, temp$`Employee ID.y`)
temp$`Age in Days.y`= ifelse(is.na(temp$`Age in Days.y`), temp$`Age in Days.x`, temp$`Age in Days.y`)
temp$`Age in Days.x` = NULL
temp$`Employee ID.x` = NULL
colnames(temp) = colnames(t)
t = temp
We could use coalesce
library(dplyr)

# Patch the main table `t` with the corrections stored in `Manual_changes`.
# After the join, the `.x` columns hold the values from `t` and the `.y`
# columns hold the values from `Manual_changes`. coalesce() returns the first
# non-missing value, so the correction (`.y`) must be listed first and the
# original (`.x`) is only the fallback -- the same rule as the ifelse() code
# in the question.
left_join(t, Manual_changes, by = "Case ID") %>%
  mutate(
    `Employee ID` = coalesce(`Employee ID.y`, `Employee ID.x`),
    `Age in Days` = coalesce(`Age in Days.y`, `Age in Days.x`)
  ) %>%
  # Keep only the original columns, in their original order; this also drops
  # the suffixed `.x`/`.y` helper columns created by the join.
  select(all_of(names(t)))
Or with data.table
library(data.table)

# Update join: for every row of `t` that has a match in `Manual_changes`,
# overwrite the two columns by reference. Inside `j`, a bare column name refers
# to `t`, and the `i.` prefix refers to the table passed as `i`
# (`Manual_changes`); a data.table join does not create `.x`/`.y` columns.
# fcoalesce() returns the first non-missing value, so the correction is listed
# first and the existing value is the fallback.
# NOTE(review): fcoalesce() expects both inputs to have the same type --
# confirm the column types of `t` match those of `Manual_changes`.
setDT(t)[
  Manual_changes,
  c("Employee ID", "Age in Days") := .(
    fcoalesce(`i.Employee ID`, `Employee ID`),
    fcoalesce(`i.Age in Days`, `Age in Days`)
  ),
  on = "Case ID"
]

r Replace multiple strings in a data frame column with multiple strings from a column of another data frame

I have a dataframe (df1) with a column "Partcipant_ID". Some participant IDs are wrong and should be replaced with the correct participant ID. I have another dataframe (df2) where all participant IDs appear in the columns Goal_ID to T4. The participant IDs in column "Goal_ID" are the correct IDs.
Now I want to replace all ParticipantIDs in df1 with all Goal_ID ParticipantIDs from df2.
This is my original dataframe (df1):
structure(list(Partcipant_ID = c("AA_SH_RA_91", "AA_SH_RA_91",
"AB_BA_PR_93", "AB_BH_VI_90", "AB_BH_VI_90", "AB_SA_TA_91", "AJ_BO_RA_92",
"AJ_BO_RA_92", "AK_SH_HA_91", "AL_EN_RA_95", "AL_MA_RA_95", "AL_SH_BA_99",
"AM_BO_AB_49", "AM_BO_AB_94", "AM_BO_AB_94", "AM_BO_AB_94", "AN_JA_AN_91",
"AN_KL_GE_11", "AN_KL_WO_91", "AN_MA_DI_95", "AN_MA_DI_95", "AN_SE_RA_95",
"AN_SE_RA_95", "AN_SI_RA_97", "AN_SO_PU_94", "AN_SU_RA_91", "AR_BO_RA_92",
"AR_KA_VI_94", "AR_KA_VI_94", "AS_AR_SO_90", "AS_AR_SU_95", "AS_KU_SO_90",
"AS_MO_AS_97", "AW_SI_OJ_97", "AW_SI_OJ_97", "AY_CH_SU_97", "BH_BE_LD_84",
"BH_BE_LI_83", "BH_BE_LI_83", "BH_BE_LI_84", "BH_KO_SA_87", "BH_PE_AB_89",
"BH_YA_SA_87", "BI_CH_PR_94", "BI_CH_PR_94"), Start_T2 = structure(c(NA,
NA, NA, NA, 1579514871, 1576658745, NA, 1579098225, NA, NA, 1576663067,
1576844759, NA, 1577330639, NA, NA, 1576693930, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, 1577718380, 1577718380, 1577454467, NA,
NA, 1576352237, NA, NA, NA, NA, 1576420656, 1576420656, NA, NA,
1578031772, 1576872938, NA, NA), class = c("POSIXct", "POSIXt"
), tzone = "UTC"), End_T2 = structure(c(NA, NA, NA, NA, 1579515709,
1576660469, NA, 1579098989, NA, NA, 1576693776, 1576845312, NA,
1577331721, NA, NA, 1576694799, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, 1577719049, 1577719049, 1577455167, NA, NA, 1576352397,
NA, NA, NA, NA, 1576421607, 1576421607, NA, NA, 1578032408, 1576873875,
NA, NA), class = c("POSIXct", "POSIXt"), tzone = "UTC")), row.names = c(NA,
45L), class = "data.frame")
And this is the reference data frame (df2):
structure(list(Goal_ID = c("AJ_BO_RA_92", "AL_EN_RA_95", "AM_BO_AB_49",
"AS_KU_SO_90", "BH_BE_LI_84", "BH_YA_SA_87", "BI_CH_PR_94", "BI_CH_PR_94"
), T2 = c("AJ_BO_RA_92", "AL_MA_RA_95", "AM_BO_AB_94", "AS_AR_SO_90",
"BH_BE_LI_83", "BH_YA_SA_87", "BI_NA_PR_94", "BI_NA_PR_94"),
T3 = c("AR_BO_RA_92", "AL_MA_RA_95", "AM_BO_AB_94", NA, "BH_BE_LI_83",
NA, "BI_CH_PR_94", "BI_CH_PR_94"), T4 = c("AJ_BO_RA_92",
"AL_MA_RA_95", "AM_BO_AB_94", NA, "BH_BE_LI_83", "BH_KO_SA_87",
"BI_CH_PR_94", "BI_CH_PR_94")), row.names = c(NA, -8L), class = c("tbl_df",
"tbl", "data.frame"))
For example, in my df1, I want
"AR_BO_RA_92" to be replaced by "AJ_BO_RA_92";
"AL_MA_RA_95" to be replaced by "AL_EN_RA_95";
"AM_BO_AB_94" to be replaced by "AM_BO_AB_49"
and so on...
I thought about using string_replace and I started with this:
df1$Partcipant_ID <- str_replace(df1$Partcipant_ID, "AR_BO_RA_92", "AJ_BO_RA_92")
But that is of course very inefficient because I have so many replacements, and it would be nice to make use of my reference data frame. I just cannot figure it out myself.
I hope this is understandable. Please ask if you need additional information.
Thank you so much already!
You can use match to find where each string is located and exchange those that were found (i.e. are not NA), like this:
# unlist(df2[-1]) stacks the alias columns (everything except Goal_ID) column
# by column, so match() returns a position in that stacked vector (or NA).
# `(pos - 1) %% nrow(df2) + 1` turns that position back into a row of df2 and
# stays within 1..nrow(df2); a plain `pos %% nrow(df2)` would turn a hit in the
# last row into index 0, which silently drops elements when subsetting.
i <- (match(df1$Partcipant_ID, unlist(df2[-1])) - 1) %% nrow(df2) + 1
# Only IDs that were found among the aliases are replaced.
j <- !is.na(i)
df1$Partcipant_ID[j] <- df2$Goal_ID[i[j]]
df1$Partcipant_ID
# [1] "AA_SH_RA_91" "AA_SH_RA_91" "AB_BA_PR_93" "AB_BH_VI_90" "AB_BH_VI_90"
# [6] "AB_SA_TA_91" "AJ_BO_RA_92" "AJ_BO_RA_92" "AK_SH_HA_91" "AL_EN_RA_95"
#[11] "AL_EN_RA_95" "AL_SH_BA_99" "AM_BO_AB_49" "AM_BO_AB_49" "AM_BO_AB_49"
#[16] "AM_BO_AB_49" "AN_JA_AN_91" "AN_KL_GE_11" "AN_KL_WO_91" "AN_MA_DI_95"
#[21] "AN_MA_DI_95" "AN_SE_RA_95" "AN_SE_RA_95" "AN_SI_RA_97" "AN_SO_PU_94"
#[26] "AN_SU_RA_91" "AJ_BO_RA_92" "AR_KA_VI_94" "AR_KA_VI_94" "AS_KU_SO_90"
#[31] "AS_AR_SU_95" "AS_KU_SO_90" "AS_MO_AS_97" "AW_SI_OJ_97" "AW_SI_OJ_97"
#[36] "AY_CH_SU_97" "BH_BE_LD_84" "BH_BE_LI_84" "BH_BE_LI_84" "BH_BE_LI_84"
#[41] "BH_YA_SA_87" "BH_PE_AB_89" "BH_YA_SA_87" "BI_CH_PR_94" "BI_CH_PR_94"
I think this might work. Create a true lookup table with one column of correct codes and one column of incorrect codes, i.e. stack the columns, then join the resulting df3 to df1 and use coalesce to create the new ID. You spelt participant wrong, which made me feel more human; I always do that.
library(dplyr)
df3 <- df2[1:2] %>%
bind_rows(df2[c(1,3)] %>% rename(T2 = T3),
df2[c(1,4)] %>% rename(T2 = T4)) %>%
distinct()
df1 %>%
left_join(df3, by = c("Partcipant_ID" = "T2")) %>%
mutate(Goal_ID = coalesce(Goal_ID, Partcipant_ID)) %>%
select(Goal_ID, Partcipant_ID, Start_T2, End_T2)

linear regression model with dplyr on specified columns by name

I have the following data frame, each row containing four dates ("y") and four measurements ("x"):
df = structure(list(x1 = c(69.772808673525, NA, 53.13125414839,
17.3033274666411,
NA, 38.6120670385487, 57.7229000792707, 40.7654208618078, 38.9010405201831,
65.7108936694177), y1 = c(0.765671296296296, NA, 1.37539351851852,
0.550277777777778, NA, 0.83037037037037, 0.0254398148148148,
0.380671296296296, 1.368125, 2.5250462962963), x2 = c(81.3285388496182,
NA, NA, 44.369872853302, NA, 61.0746827226573, 66.3965114460601,
41.4256874481852, 49.5461413070349, 47.0936997726146), y2 =
c(6.58287037037037,
NA, NA, 9.09377314814815, NA, 7.00127314814815, 6.46597222222222,
6.2462962962963, 6.76976851851852, 8.12449074074074), x3 = c(NA,
60.4976916064608, NA, 45.3575294731303, 45.159758146854, 71.8459173097114,
NA, 37.9485456227131, 44.6307631013742, 52.4523342186143), y3 = c(NA,
12.0026157407407, NA, 13.5601157407407, 16.1213657407407, 15.6431018518519,
NA, 15.8986805555556, 13.1395138888889, 17.9432638888889), x4 = c(NA,
NA, NA, 57.3383407228293, NA, 59.3921356160536, 67.4231673171527,
31.853845252547, NA, NA), y4 = c(NA, NA, NA, 18.258125, NA,
19.6074768518519,
20.9696527777778, 23.7176851851852, NA, NA)), class = c("tbl_df",
"tbl", "data.frame"), row.names = c(NA, -10L))
I would like to create an additional column containing the slope of all the y's versus all the x's, for each row (each row is a patient with these 4 measurements).
Here is what I have so far:
df <- df %>% mutate(Slope = lm(vars(starts_with("y") ~
vars(starts_with("x"), data = .)
I am getting an error:
invalid type (list) for variable 'vars(starts_with("y"))'...
What am I doing wrong, and how can I calculate the rowwise slope?
You are using a tidyverse syntax but your data is not tidy...
Maybe you should rearrange your data.frame and rethink the way you store your data.
Here is how to do it in a quick and dirty way (at least if I understood your explanations correctly):
# Reshape the wide table (x1..x4, y1..y4) into one row per patient and
# measurement. reshape() is a base function written for plain data frames, so
# the tibble is converted first.
wide <- as.data.frame(df)
x_cols <- paste0("x", 1:4)
y_cols <- paste0("y", 1:4)
# There is no "patient" column yet: reshape() creates the id variable itself,
# numbering the rows of the wide table.
long_x <- reshape(wide[, x_cols], direction = "long", varying = list(x_cols),
                  v.names = "x", idvar = "patient")
long_y <- reshape(wide[, y_cols], direction = "long", varying = list(y_cols),
                  v.names = "y", idvar = "patient")
# merge() joins on the columns the two long tables share ("time", "patient").
df <- merge(long_x, long_y)
df$patient <- factor(df$patient)
Then you could loop over the patients, perform a linear regression and get the slopes as a vector:
# Fit one regression of y on x per patient and keep only the slope.
# vapply() guarantees a numeric vector with exactly one value per patient,
# named by patient level. Rows with a missing x or y are dropped by na.omit;
# a patient with a single complete pair gets an NA slope.
vapply(levels(df$patient), function(pat) {
  fit <- lm(y ~ x, data = df[df$patient == pat, ], na.action = na.omit)
  # Select the slope by name rather than by position.
  coef(fit)[["x"]]
}, numeric(1))

How to dput() a raster

If I use dput() to output the structure of a raster object created using the raster package, then assigning that structure back into a new object throws an error
Error in datanotation %in% c("LOG1S", "INT1S", "INT2S", "INT4S", "INT1U", :
error in evaluating the argument 'x' in selecting a method for function '%in%': Error: object 'datanotation' not found
Example output from dput to test this on:
rast <- new("RasterLayer", file = new(".RasterFile"
, name = ""
, datanotation = "FLT4S"
, byteorder = "little"
, nodatavalue = -Inf
, NAchanged = FALSE
, nbands = 1L
, bandorder = "BIL"
, offset = 0L
, toptobottom = TRUE
, blockrows = 0L
, blockcols = 0L
, driver = ""
, open = FALSE
)
, data = new(".SingleLayerData"
, values = c(NA, NA, NA, NA, 27.7696047300953, 25.8297302967319, 21.8282877533719,
18.2355885882618, 27.0557882676846, 27.2210269605054, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, 27.7812364734848, 27.405183119753,
24.2674419226904, 21.1096354803572, 19.7839120235376, 28.0337762198564,
30.3552042477317, 27.9129238649901, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, 27.8602581108286, 25.5695030720577, 19.308317452836,
20.2224030952562, 19.8943689815922, 26.0737945219631, 29.8730429910469,
30.0356550838097, NA, NA, NA, NA, NA, NA, NA, NA, NA, 27.9364248138976,
26.9457930700303, 23.0304323166943, 19.4650798632613, 19.0999036995668,
17.5193560841074, 27.7251998095169, 28.4496104452209, 28.9315408261731,
NA, NA, NA, NA, NA, NA, NA, NA, NA, 26.8544908125766, 25.0566493895284,
19.392461671792, 17.9138961574326, 18.457466509715, 16.2828861956587,
24.3601694045773, 28.1808209395655, 28.9282707782622, 26.6332021683416,
NA, NA, NA, NA, NA, NA, NA, 25.7558302469057, 25.8550702427802,
22.5693001232205, 19.6993922601795, 16.1425172340908, 18.5221217322922,
15.6749250516081, 23.7808882591915, 26.8347423074187, 27.2630654814702,
25.9184967686647, NA, NA, NA, NA, NA, NA, NA, 24.8123372469289,
21.9120014347897, 21.8593245154305, 18.7720082061109, 19.7574885247249,
18.4980326509342, 16.3585539605331, 24.9138993320561, 25.2434828477134,
24.163634092843, 21.0163621891882, 20.415437668758, NA, NA, NA,
NA, NA, NA, 24.1877819407117, 20.6452893546199, 20.1902008603325,
19.3002926063194, 16.8587312480956, 16.2594198755341, 19.2032612963314,
23.627249155838, 20.2610810034085, 20.512646252079, 21.2108132984962,
21.3929956864179, 22.5462104762584, NA, NA, NA, NA, 28.1377507911064,
26.6783600800768, 21.9226216069185, 18.7325546681671, 19.3040954243679,
18.9295032049331, 16.9754437056141, 18.4150075374079, 22.1472527043877,
23.0212426364059, 24.3613220176048, 23.8262550760194, 23.1817611577951,
23.5871416966677, 24.5249361302642, 24.8507563698565, 24.5380700828535,
24.6222669309606, 28.1310406991608, 26.6318516890262, 22.2093701933002,
18.5946021290531, 18.8365649393596, 19.1392679329481, 18.0261774155026,
15.4867865984622, 22.6594382919435, 24.0000969920539, 26.8590549383737,
25.3828920205212, 24.7396876533108, 24.2529425383968, 25.4417776029091,
25.4515553773556, 25.362837214521, 24.9104771169439, NA, 26.0466484214637,
22.5547365784066, 21.385068811716, 21.7011412999039, 21.5908931968994,
17.5042944634609, 16.1420136345859, 22.2389789351034, 24.9668657681713,
27.1669375893459, 26.7413589409561, 26.421134458086, 26.219636989708,
26.3182362528439, 26.4198369697735, 26.3436222765849, NA, NA,
NA, 23.0141531354431, 25.0932401677589, 21.6662460243741, 20.5190520941524,
18.375683158989, 21.0476911567136, 24.5643254483451, 26.328155553503,
26.272891752264, 27.9174179692592, 27.8104921435185, 27.3675508861065,
27.0387799062499, 26.7967626268208, NA, NA, NA, NA, 23.9459211033352,
27.0411266756461, 24.5366020483741, 21.827679045105, 20.9547907819176,
22.0691273897516, 23.4745553057174, 26.3462307150211, 28.3701287602482,
27.3758861302374, 27.3750656061461, 27.962003557149, 27.5345722331493,
NA, NA, NA, NA, NA, NA, 28.2432491874035, 25.6912040459346, 23.8651528078732,
21.6046243626329, 22.9131480337219, 23.819129269607, 26.3568262380137,
28.9188481472128, 28.1497370861287, 27.7360100735352, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, 25.045874725646, 23.2493769507419,
23.1730515314323, 24.2038209656421, 28.3416254663092, 28.8959112067936,
NA, NA, NA, NA, NA, NA, NA)
, offset = 0
, gain = 1
, inmemory = TRUE
, fromdisk = FALSE
, isfactor = FALSE
, attributes = list()
, haveminmax = TRUE
, min = 15.4867865984622
, max = 30.3552042477317
, band = 1L
, unit = ""
, names = "MAT_eclp"
)
, legend = new(".RasterLegend"
, type = character(0)
, values = logical(0)
, color = logical(0)
, names = logical(0)
, colortable = logical(0)
)
, title = character(0)
, extent = new("Extent"
, xmin = 832565.530013465
, xmax = 2452565.53001346
, ymin = 383803.949813352
, ymax = 1733803.94981335
)
, rotated = FALSE
, rotation = new(".Rotation"
, geotrans = numeric(0)
, transfun = function ()
NULL
)
, ncols = 18L
, nrows = 15L
, crs = new("CRS"
, projargs = "+proj=aea +lat_1=20 +lat_2=-23 +lat_0=0 +lon_0=25 +x_0=0 +y_0=0 +datum=WGS84 +units=m +no_defs +ellps=WGS84 +towgs84=0,0,0"
)
, history = list()
, z = list()
)
Or, another minimal example to dput(), then try assigning output to another object:
library(raster)
r1 <- raster(nrow=10, ncol=10)
values(r1) <- runif(ncell(r1))
dput(r1)
How can I load a raster from the text version and avoid this error?
This is a bug, due to an error in the validity check of the .RasterFile object (part of the RasterLayer). Illustrated here:
x <- new(".RasterFile")
validObject(x)
#Error in datanotation %in% c("LOG1S", "INT1S", "INT2S", "INT4S", "INT1U", :
# object 'datanotation' not found
I fixed this in version 2.5-11 (available from R-Forge in an hour or so: install.packages("raster", repos="http://R-Forge.R-project.org") ).
P.S. why would you want to use dput/dget?

How to create a proper dataset for boxplots

I'm having trouble creating a proper boxplot of my dataset. None of the solutions on this platform work for me, because their datasets all look different, with variables plotted against each other.
So I want to ask: how do I need to format my dataset if it only contains 3 variables and their measured values in 3 columns. In the boxplot examples here, they plot a variable against another one but here this is not the case right?
Using boxplot(data) gives me 3 boxplots. But I want to show the MEAN and also the population size on each boxplot. I don't know how to use the solution as they are all about ggplot2 or boxplot with variables against each other.
I know that this must be simple, but I think I'm plotting the boxplots on a bad method and that's why the solutions on this site don't work?
Data:
structure(list(Rest = c(3.479386607, 3.478445796, 2.52227462,
1.726115552, 3.917693859, 2.300840122), Peat = c(16.79515746,
22.76673699, 24.43289941, 15.64168939, 31.60459098, 16.2369787
), Top.culture = c(8.288, 8.732, 5.199, 6.539, 3.248, 10.156)), .Names = c("Rest",
"Peat", "Top.culture"), row.names = c(NA, 6L), class = "data.frame")
If text annotation is what is meant by 'show the mean and also the population size' then:
boxplot(dat)
# Label every box with the mean and the number of non-missing values of its
# column. vapply() pins the result type of each summary.
col_means <- round(vapply(dat, mean, numeric(1), na.rm = TRUE), 2)
col_n <- vapply(dat, function(x) sum(!is.na(x)), integer(1))
# One label per column of `dat` (boxes are drawn at x = 1, 2, ...), all placed
# at height y = 12.5 on the plot's value axis.
text(seq_along(dat), 12.5, paste("Mean= ", col_means, "\n N= ", col_n))
This used your more complex data-object from the other (duplicated) question.
dat <- structure(list(Rest = c(3.479386607, 3.478445796, 2.52227462, 1.726115552, 3.917693859, 2.300840122, 2.326307503, 2.344828287, 4.654278623, 3.68669447, 3.343706863, 0.712228306, 2.735897248, 1.936723375, 2.724260325, 2.069633651, 1.741484154, 2.304391217, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), Peat = c(16.79515746, 22.76673699, 24.43289941, 15.64168939, 31.60459098, 16.2369787, 32.63285246, 35.91852324, 19.27802839, 21.78974576, 30.39119451, 35.4846573, 42.21807817, 42.00913743, 40.96996704, 19.85075354, 17.247096, 22.81689524, 43.35990368, 37.57273508, 23.76889902, 38.34604591, 20.98376674, 16.44173119, 17.27639888, NA, NA, NA, NA, NA, NA), Top.culture = c(8.288, 8.732, 5.199, 6.539, 3.248, 10.156, 3.436, 5.584, 4.483, 2.087, 3.28, 2.71, 2.196, 4.971, 4.475, 6.361, 5.49, 9.085, 3.52, 5.772, 9.308, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA)), .Names = c("Rest", "Peat", "Top.culture" ), class = "data.frame", row.names = c(NA, -31L))

Resources