apply over dataframe - r

I've got two structures:
max_map <-
structure(list(name = structure(1:11, .Label = c("2-Acetylaminofluorene",
"amsacrine", "aniline", "aspartame", "cyclophosphamide", "doxorubicin",
"indomethacin", "phenacetin", "quercetin", "raloxifene", "urethane"
), class = "factor"), value = c(0.811811403850414, 0.8670680916324,
0.794704077953131, 0.652724115286456, 0.946812003911574, 0.94467294086402,
0.99210186168903, 0.965998352825426, 0.953645104970837, 0.903845608662668,
0.858610554863266)), .Names = c("name", "value"), row.names = c(NA,
-11L), class = "data.frame")
maps <-
structure(list(name = c("2-Acetylaminofluorene", "amsacrine",
"aniline", "aspartame", "cyclophosphamide", "doxorubicin", "indomethacin",
"phenacetin", "quercetin", "raloxifene", "urethane"), avg_relations_fan = c(0.596381660936706,
0.627169363301574, 0.52144016932515, 0.335756276148214, 0.710245148396949,
0.786168090022777, 0.931928694886563, 0.797790600434933, 0.836458734127729,
0.764397331494529, 0.548648356310039), baseline = c(0.441175818174093,
0.661376446637227, 0.470246408568704, 0.325159351267395, 0.664171399502648,
0.75247341151084, 0.894791275258052, 0.79447733086043, 0.791316894314006,
0.593161248492605, 0.546928771024265), baseline_mesh = c(0.511440934523423,
0.635334407445469, 0.466187120416127, 0.292197730456067, 0.712015987803737,
0.774493950979802, 0.936857915628513, 0.776404901563741, 0.786072875131457,
0.586564923115283, 0.602183350788001), standard = c(0.441269542443449,
0.656249151603696, 0.451995996997505, 0.331622681220588, 0.680778834932872,
0.742015626142688, 0.883911615393179, 0.791293422595675, 0.760673562009157,
0.559234401021581, 0.555385232882166), sum_relations_fan = c(0.593111715736251,
0.518197244570419, 0.52676186810563, 0.331234383858585, 0.697489423349489,
0.77249112456473, 0.940506641487552, 0.79946569580319, 0.82893149142568,
0.749819491774919, 0.624830313758535), total = c(0.593111715736251,
0.518197244570419, 0.52676186810563, 0.331234383858585, 0.697489423349489,
0.77249112456473, 0.940506641487552, 0.79946569580319, 0.82893149142568,
0.749819491774919, 0.624830313758535)), .Names = c("name", "avg_relations_fan",
"baseline", "baseline_mesh", "standard", "sum_relations_fan",
"total"), row.names = c(NA, 11L), class = c("cast_df", "data.frame"
), idvars = "name", rdimnames = list(structure(list(name = c("2-Acetylaminofluorene",
"amsacrine", "aniline", "aspartame", "cyclophosphamide", "doxorubicin",
"indomethacin", "phenacetin", "quercetin", "raloxifene", "urethane"
)), .Names = "name", row.names = c("2-Acetylaminofluorene", "amsacrine",
"aniline", "aspartame", "cyclophosphamide", "doxorubicin", "indomethacin",
"phenacetin", "quercetin", "raloxifene", "urethane"), class = "data.frame"),
structure(list(series = c("avg_relations_fan", "baseline",
"baseline_mesh", "standard", "sum_relations_fan", "total"
)), .Names = "series", row.names = c("avg_relations_fan",
"baseline", "baseline_mesh", "standard", "sum_relations_fan",
"total"), class = "data.frame")))
And I'd like to apply the function x/y over the maps dataframe,
where x is the current value and y is the corresponding value along
the name.
I already tried
mapply(function(x,y) {x/y}, t(maps[,!names(maps) %in% c('name')]), arrange(max_map, name)$value)
but that gives me one big list without any names associated. I'd like
the results to be similar to the maps dataframe, just with different values.

I'm just guessing here, but maybe you're looking to do something like this:
m <- merge(maps,max_map)
m[,2:7] <- m[,2:7] / m[,8]
Without the merge and without specifying how many columns you have:
maps[,-1] <- maps[,-1] / max_map$value
again, assuming that both are in identical orders.

Joran's answer is definitely the better way, but this might help you understand mapply better. Each argument is a list, and the shorter of the two is recycled, in this case, the second one.
mapply(function(x,y) {x/y}, maps[,!names(maps) %in% c('name')], list(arrange(max_map, name)$value))
avg_relations_fan baseline baseline_mesh standard sum_relations_fan total
[1,] 0.7346308 0.5434462 0.6299997 0.5435616 0.7306028 0.7306028
[2,] 0.7233219 0.7627734 0.7327388 0.7568600 0.5976431 0.5976431
[3,] 0.6561438 0.5917252 0.5866172 0.5687601 0.6628403 0.6628403
[4,] 0.5143923 0.4981574 0.4476589 0.5080595 0.5074646 0.5074646
[5,] 0.7501438 0.7014818 0.7520141 0.7190222 0.7366715 0.7366715
[6,] 0.8322119 0.7965438 0.8198541 0.7854736 0.8177339 0.8177339
[7,] 0.9393478 0.9019147 0.9443163 0.8909484 0.9479940 0.9479940
[8,] 0.8258716 0.8224417 0.8037332 0.8191457 0.8276057 0.8276057
[9,] 0.8771174 0.8297813 0.8242824 0.7976485 0.8692243 0.8692243
[10,] 0.8457167 0.6562639 0.6489658 0.6187278 0.8295880 0.8295880
[11,] 0.6389956 0.6369928 0.7013463 0.6468418 0.7277226 0.7277226

Related

create data frame from nested entries

I have a data frame test like this:
dput(test)
structure(list(X = 1L, entityId = structure(1L, .Label = "HOST-123", class = "factor"),
displayName = structure(1L, .Label = "server1", class = "factor"),
discoveredName = structure(1L, .Label = "server1", class = "factor"),
firstSeenTimestamp = 1593860000000, lastSeenTimestamp = 1603210000000,
tags = structure(1L, .Label = "c(\"CONTEXTLESS\", \"CONTEXTLESS\", \"CONTEXTLESS\", \"CONTEXTLESS\", \"CONTEXTLESS\", \"CONTEXTLESS\", \"CONTEXTLESS\", \"CONTEXTLESS\"), c(\"app1\", \"client\", \"org\", \"app1\", \"DATA_CENTER\", \"PURPOSE\", \"REGION\", \"Test\"), c(NA, \"NONE\", \"Host:Environment:test123\", \"111\", \"222\", \"GENERAL\", \"444\", \"555\")", class = "factor")), .Names = c("X",
"entityId", "displayName", "discoveredName", "firstSeenTimestamp",
"lastSeenTimestamp", "tags"), class = "data.frame", row.names = c(NA,
-1L))
There is a column called tags which should become a dataframe. I need to get rid of the first row in tags (which keeps saying CONTEXTLESS) and expand the second column in tags (make its values columns). Lastly, I need to insert the 3rd column values in tags under each expanded column.
For example, it needs to look like this:
structure(list(entityId = structure(1L, .Label = "HOST-123", class = "factor"),
displayName = structure(1L, .Label = "server1", class = "factor"),
discoveredName = structure(1L, .Label = "server1", class = "factor"),
firstSeenTimestamp = 1593860000000, lastSeenTimestamp = 1603210000000,
app1 = NA, client = structure(1L, .Label = "None", class = "factor"),
org = structure(1L, .Label = "Host:Environment:test123", class = "factor"),
app1.1 = 111L, data_center = 222L, purppose = structure(1L, .Label = "general", class = "factor"),
region = 444L, test = 555L), .Names = c("entityId", "displayName",
"discoveredName", "firstSeenTimestamp", "lastSeenTimestamp",
"app1", "client", "org", "app1.1", "data_center", "purppose",
"region", "test"), class = "data.frame", row.names = c(NA, -1L
))
I need to remove the 1st vector that keeps saying "contextless" and turn the second vector into columns: each value of the 2nd vector should be a column name. The last vector should supply the values of the newly added columns.
If you are willing to drop the first "row" of garbage and then do a little cleanup of the parse side effects, then this might be a good place to start:
read.table(text=gsub("\\),", ")\n", test$tags[1]), sep=",", skip=1, #drops line
header=TRUE)
c.app1 client org app1 DATA_CENTER PURPOSE REGION Test.
1 c(NA NONE Host:Environment:test123 111 222 GENERAL 444 555)
The read.table function uses the scan function, which doesn't know that "c(" and ")" are meaningful. The other alternative might be to try eval(parse(text= .)) (which would know that they are enclosing vectors) on the second and third lines, but I couldn't see a clean way to do that. I initially tried to separate the lines using strsplit, but that caused me to lose the parens.
Here's a stab at some cleanup via the addition of some more gsub operations:
read.table(text=gsub("c\\(|\\)","", # gets rid of enclosing "c(" and ")"
gsub("\\),", "\n", # inserts line breaks
test$tags[1])),
sep=",", #lets commas be parsed
skip=1, #drops line
header=TRUE) # converts to colnames
app1 client org app1.1 DATA_CENTER PURPOSE REGION Test
1 NA NONE Host:Environment:test123 111 222 GENERAL 444 555
The reason for the added ".1" in the second instance of app1 is that R colnames in dataframes need to be unique unless you override that with check.names=FALSE
Here is a tidyverse approach
library(dplyr)
library(tidyr)
# Parse a string of comma-separated R vector literals, for example
# 'c("a", "b"), c("k1", "k2")', into a data.frame with one column per vector.
#
# Args:
#   txt:  Character (or factor) scalar holding the vector literals. All vectors
#         must have the same length so that data.frame() can bind them.
#   keep: "all" (default) to return every column, or any column index accepted
#         by `[`: names such as c("X2", "X3"), positions, or negative
#         positions such as -1L to drop the first column.
#
# Returns:
#   The parsed data.frame with columns renamed X1, X2, ... in order of
#   appearance and subset to `keep`. As with `[` on a data.frame, selecting a
#   single column returns a plain vector.
#
# NOTE: `txt` is evaluated as R code via eval(parse()); only call this on
# trusted input.
str2dataframe <- function(txt, keep = "all") {
  # If you can confirm that all vectors are of the same length, then we can
  # make them into columns of a data.frame
  out <- eval(parse(text = paste0("data.frame(", as.character(txt), ")")))
  # rename columns as X1, X2, ...
  nms <- make.names(seq_along(out), unique = TRUE)
  # identical() always yields a single TRUE/FALSE, even when `keep` is a
  # vector; `keep == "all"` is vectorised and breaks `if` for length > 1.
  if (identical(keep, "all")) {
    keep <- nms
  }
  `names<-`(out, nms)[, keep]
}
# Expand the text in `tags` into one column per tag name.
df %>%
  mutate(
    tags = lapply(tags, function(txt) {
      # Parse the text, dropping the first parsed column (keep = -1L).
      parsed <- str2dataframe(txt, -1L)
      # De-duplicate repeated tag names so they can become distinct columns.
      parsed$X2 <- make.unique(parsed$X2)
      parsed
    })
  ) %>%
  # One row per (name, value) pair ...
  unnest(tags) %>%
  # ... then spread names (X2) into columns filled with their values (X3).
  pivot_wider(names_from = "X2", values_from = "X3")
df looks like this
> df
X entityId displayName discoveredName firstSeenTimestamp lastSeenTimestamp
1 1 HOST-123 server1 server1 1.59386e+12 1.60321e+12
tags
1 c("CONTEXTLESS", "CONTEXTLESS", "CONTEXTLESS", "CONTEXTLESS", "CONTEXTLESS", "CONTEXTLESS", "CONTEXTLESS", "CONTEXTLESS"), c("app1", "client", "org", "app1", "DATA_CENTER", "PURPOSE", "REGION", "Test"), c(NA, "NONE", "Host:Environment:test123", "111", "222", "GENERAL", "444", "555")
Output looks like this
# A tibble: 1 x 14
X entityId displayName discoveredName firstSeenTimestamp lastSeenTimestamp app1 client org app1.1 DATA_CENTER PURPOSE REGION Test
<int> <fct> <fct> <fct> <dbl> <dbl> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
1 1 HOST-123 server1 server1 1593860000000 1603210000000 NA NONE Host:Environment:test123 111 222 GENERAL 444 555

Match strings from main df with those in reference df - if found, add all cols from that row of the reference df into main df

I have a "main_df" along the lines of this:
structure(list(study_id = c("02ipnnqgeovkrxz", "02ipnnqgeovkrxz",
"02ipnnqgeovkrxz", "02ipnnqgeovkrxz", "02ipnnqgeovkrxz", "02ipnnqgeovkrxz"
), question = c("3eEVJgaAP6c9FPL", "b8GLxGjZKtstCQZ", "40iyFKjeMEFGI2V",
"6eZGejSZ1oTZYLb", "3pXAUvZH8GGuryd", "0kYkUAHe4iODUl7"), study_rt = c("1.353",
"0.714", "0.68", "0.695", "0.696", "0.656"), study_response = c("picture",
"picture", "picture", "picture", "picture", "picture")), row.names = c(NA,
-6L), class = c("grouped_df", "tbl_df", "tbl", "data.frame"), reshapeWide = list(
v.names = NULL, timevar = "index", idvar = c("study_id",
"question"), times = c("rt", "response"), varying = structure(c("response.rt",
"response.response"), .Dim = 1:2)), groups = structure(list(
study_id = "02ipnnqgeovkrxz", .rows = list(1:6)), row.names = c(NA,
-1L), class = c("tbl_df", "tbl", "data.frame"), .drop = TRUE))
and a reference df along the lines of this:
structure(list(stim = c("ashtray_word", "bell_word", "blouse_word",
"boot_word", "bottle_word", "bread_word"), url = c("eW1BRoUDV4BKQMl",
"5zKTGwHlwlzpssB", "55SVfoQudZJNCFT", "bOORR1zuKYSnAe9", "6RrOQfDZim81pHv",
"1F97ouH0HrwQOgZ"), study_list = c("A", "A", "A", "A", "A", "A"
)), row.names = c(NA, -6L), class = c("tbl_df", "tbl", "data.frame"))
Each value in the 'question' column of the main df can be found in the 'url' column of the reference df. I want to match these values, and add all columns from that row of the reference df to my main df. The output will look like this:
structure(list(study_id = c("02ipnnqgeovkrxz", "02ipnnqgeovkrxz",
"02ipnnqgeovkrxz", "02ipnnqgeovkrxz", "02ipnnqgeovkrxz", "02ipnnqgeovkrxz"
), question = c("3eEVJgaAP6c9FPL", "b8GLxGjZKtstCQZ", "40iyFKjeMEFGI2V",
"6eZGejSZ1oTZYLb", "3pXAUvZH8GGuryd", "0kYkUAHe4iODUl7"), study_rt = c("1.353",
"0.714", "0.68", "0.695", "0.696", "0.656"), study_response = c("picture",
"picture", "picture", "picture", "picture", "picture"), stim = c("chisel_picture",
"raccoon_picture", "apple_picture", "belt_picture", "bicycle_picture",
"cake_picture"), url = c("3eEVJgaAP6c9FPL", "b8GLxGjZKtstCQZ",
"40iyFKjeMEFGI2V", "6eZGejSZ1oTZYLb", "3pXAUvZH8GGuryd", "0kYkUAHe4iODUl7"
), study_list = c("B FILLER", "B FILLER", "B", "B", "B", "B")), row.names = c(NA,
-6L), groups = structure(list(study_id = "02ipnnqgeovkrxz", .rows = list(
1:6)), row.names = c(NA, -1L), class = c("tbl_df", "tbl",
"data.frame"), .drop = TRUE), class = c("grouped_df", "tbl_df", "tbl", "data.frame"))
This will allow me to see the 'sensible' item names (e.g. "chisel_picture") that subjects were responding to, as opposed to the nonsensical code names I have now (e.g. "3eEVJgaAP6c9FPL"). The same items appear over and over again in the 'question' column (as different subjects saw the same items), and I need to preserve these repeats.
I have successfully managed this using a for loop...but it's super slow! A tidyverse solution would be amazing!
My awful for loop (study_data = main df / image_urls = reference df):
# Every code in the 'question' column, repeats included.
all_study_stim_items <- study_data$question
# Start from an empty tibble; matched reference rows are appended to it below.
matched_items <- tibble()
for (i in all_study_stim_items) {
  # Rows of the reference df whose url equals the current code, restricted to
  # the three reference columns.
  temp <- select(filter(image_urls, url == i), stim, url, study_list)
  # Continuously overwrite the tibble with each match appended.
  matched_items <- bind_rows(matched_items, temp)
}
# I then join this with the main df.

values become NA after use left_join() function in r

The values from the other data frame become NA after I use the left_join() function. I checked the answer here: "dplyr::left_join produce NA values for new joined columns".
I also specified the by argument, but it still failed.
I don't know why.
qx_p2 <- structure(list(province = c("安徽", "安徽", "安徽", "安徽", "安徽"
), date = c("2020-01-21", "2020-01-22", "2020-01-23", "2020-01-24",
"2020-01-25"), PRS = c(1013.9035387141, 1011.48779584751, 1014.28302402211,
1019.16970261716, 1018.92203467498), PRS_Sea = c(1024.73084750567,
1022.22210612717, 1025.02632842026, 1029.97905104403, 1029.77650132275
), PRS_Max = c(1014.26828869048, 1011.80445613662, 1014.51628117914,
1019.43671957672, 1019.31935504063), PRS_Min = c(1013.7138513322,
1011.13447054516, 1013.86811271731, 1018.75406934996, 1018.62469257842
), WIN_S_Max = c(2.30187606292517, 2.08586132369615, 2.76893908257748,
4.22074853552532, 3.63427225056689), WIN_S_Inst_Max = c(3.44360343442933,
3.09963836923658, 4.28499952758881, 6.68930898053666, 5.80619165721844
), WIN_D_INST_Max = c(116.878029336735, 218.745851048753, 120.88310303288,
72.1640447845805, 72.0331526360544), WIN_D_Avg_2mi = c(116.23329724764,
210.524530689871, 113.104009452075, 68.7694017991261, 70.322008604388
), WIN_S_Avg_2mi = c(1.77558118386243, 1.49959490740741, 2.20936874055178,
3.47942613851096, 2.99431642101285), WIN_D_S_Max = c(116.68018866665,
218.180671371681, 120.40502999811, 71.0831467309146, 68.3670670351474
), TEM = c(3.81968088624339, 5.16464226662887, 6.82721856103553,
5.98099596088435, 4.8940626181028), TEM_Max = c(4.04776301492819,
5.35075514928193, 6.97597470238095, 6.15192401266062, 5.07960293839758
), TEM_Min = c(3.49020455404384, 4.95346053004535, 6.65049142573696,
5.85618067365835, 4.76455794123205), RHU = c(85.9359859221466,
96.1710766250945, 91.749678760393, 88.3347741874528, 80.693040202192
), VAP = c(6.98015376984127, 8.55406509826153, 9.08114866780046,
8.27843124055178, 6.98599714191232), RHU_Min = c(83.965092356387,
95.6411387471655, 90.9997401738473, 87.3134436413454, 79.2219635770975
), PRE_1h = c(0.102133763227513, 0.422205333522298, 1488.33246492347,
0.0715384070294785, 372.116791028911)), class = c("tbl_df", "tbl",
"data.frame"), row.names = c(NA, -5L))
covid_p2 <- structure(list(province = c("安徽", "安徽", "安徽", "安徽", "安徽"
), date = c("2020/1/21", "2020/1/22", "2020/1/23", "2020/1/24",
"2020/1/25"), 新增确诊 = c(0L, 1L, 14L, 24L, 21L)), class = c("tbl_df",
"tbl", "data.frame"), row.names = c(NA, -5L))
dat2 <- covid_p2 %>% left_join(qx_p2, by = c('province' = 'province', 'date' = 'date'))
dat2
Your date columns are character columns and do not have the same format:
qx_p2$date
# "2020-01-21" "2020-01-22" "2020-01-23" "2020-01-24" "2020-01-25"
covid_p2$date
# "2020/1/21" "2020/1/22" "2020/1/23" "2020/1/24" "2020/1/25"
You can get them in the same format by, for example, applying as.Date(...):
covid_p2$date <- as.Date(covid_p2$date)
qx_p2$date <- as.Date(qx_p2$date)
After that, your join works.

Arithmetic on summarized dataframe from dplyr in R

I have a large dataset, and I use dplyr's summarize() to generate some means.
Occasionally, I would like to perform arithmetic on that output.
For example, I would like to get the mean of means from the output below, say "m.biomass".
I've tried this mean(data.sum[,7]) and this mean(as.list(data.sum[,7])). Is there a quick and easy way to achieve this?
data.sum <-structure(list(scenario = c("future", "future", "future", "future"
), state = c("fl", "ga", "ok", "va"), m.soc = c(4090.31654013689,
3654.45350562628, 2564.33199749487, 4193.83388887064), m.npp = c(1032.244475,
821.319385, 753.401315, 636.885535), sd.soc = c(56.0344229400332,
97.8553643582118, 68.2248389927858, 79.0739969429246), sd.npp = c(34.9421782033153,
27.6443555578531, 26.0728757486901, 24.0375040705595), m.biomass = c(5322.76631158111,
3936.79457763176, 3591.0902359206, 2888.25308402464), sd.m.biomass = c(3026.59250918009,
2799.40317348016, 2515.10516340438, 2273.45510178843), max.biomass = c(9592.9303,
8105.109, 7272.4896, 6439.2259), time = c("1980-1999", "1980-1999",
"1980-1999", "1980-1999")), .Names = c("scenario", "state", "m.soc",
"m.npp", "sd.soc", "sd.npp", "m.biomass", "sd.m.biomass", "max.biomass",
"time"), class = c("grouped_df", "tbl_df", "tbl", "data.frame"
), row.names = c(NA, -4), vars = list(quote(scenario)), labels = structure(list(
scenario = "future"), class = "data.frame", row.names = c(NA,
-1), vars = list(quote(scenario)), drop = TRUE, .Names = "scenario"), indices = list(0:3))
We can use [[ to extract the column as a vector; as mean only works on a vector or a matrix -- not on a data.frame. If the OP wanted to do this on a single column, use this:
mean(data.sum[[7]])
#[1] 3934.726
If data.sum had only the data.frame class, data.sum[,7] would extract the column as a vector, but the tbl_df class prevents it from collapsing to a vector.
For multiple columns, the dplyr also has specialised functions
data.sum %>%
summarise_each(funs(mean), 3:7)

making the first row a header in a dataframe in r

I've seen this asked here: Create header of a dataframe from the first row in the data frame
and here: assign headers based on existing row in dataframe in R
and the solutions offered don't work for me.
When I transpose my dataframe (p1), the header of the transposed dataframe (p1t) is something new and annoying, and the first row of p1t is what I would like to use as the header. I tried:
colnames(p1t) = p1t[1, ]
and it doesn't work!
here is how the original df appears:
File Fp1.PD_ShortSOA_FAM Fp1.PD_LongSOA_FAM Fp1.PD_ShortSOA_SEMplus_REAL Fp1.PD_ShortSOA_SEMplus_FICT
sub0001 0,446222 2,524,804 0,272959 1,281,349
sub0002 1,032,688 2,671,048 1,033,278 1,217,817
And here is how the transpose appears:
row.names V1 V2
File sub0001 sub0002
Fp1.PD_ShortSOA_FAM 0,446222 1,032,688
Fp1.PD_LongSOA_FAM 2,524,804 2,671,048
Fp1.PD_ShortSOA_SEMplus_REAL 0,272959 1,033,278
Fp1.PD_ShortSOA_SEMplus_FICT 1,281,349 1,217,817
Fp1.PD_ShortSOA_SEMminus_REAL 0,142739 1,405,100
Fp1.PD_ShortSOA_SEMminus_FICT 1,515,577 -1,990,458
How can I make "File", "sub0001","sub0002" etc... as the header?
Thanks!
Works for me (with a little trick).
x <- read.table(text = "File Fp1.PD_ShortSOA_FAM Fp1.PD_LongSOA_FAM Fp1.PD_ShortSOA_SEMplus_REAL Fp1.PD_ShortSOA_SEMplus_FICT
sub0001 0,446222 2,524,804 0,272959 1,281,349
sub0002 1,032,688 2,671,048 1,033,278 1,217,817",
header = TRUE)
x <- t(x)
colnames(x) <- x[1, ]
x <- x[-1, ]
x
sub0001 sub0002
Fp1.PD_ShortSOA_FAM "0,446222" "1,032,688"
Fp1.PD_LongSOA_FAM "2,524,804" "2,671,048"
Fp1.PD_ShortSOA_SEMplus_REAL "0,272959" "1,033,278"
Fp1.PD_ShortSOA_SEMplus_FICT "1,281,349" "1,217,817"
We can make use of transpose from data.table
library(janitor)
data.table::transpose(x, keep.names = 'File') %>%
row_to_names(1)
# File sub0001 sub0002
#2 Fp1.PD_ShortSOA_FAM 0,446222 1,032,688
#3 Fp1.PD_LongSOA_FAM 2,524,804 2,671,048
#4 Fp1.PD_ShortSOA_SEMplus_REAL 0,272959 1,033,278
#5 Fp1.PD_ShortSOA_SEMplus_FICT 1,281,349 1,217,817
data
x <- structure(list(File = structure(1:2, .Label = c("sub0001", "sub0002"
), class = "factor"), Fp1.PD_ShortSOA_FAM = structure(1:2, .Label = c("0,446222",
"1,032,688"), class = "factor"), Fp1.PD_LongSOA_FAM = structure(1:2, .Label = c("2,524,804",
"2,671,048"), class = "factor"), Fp1.PD_ShortSOA_SEMplus_REAL = structure(1:2, .Label = c("0,272959",
"1,033,278"), class = "factor"), Fp1.PD_ShortSOA_SEMplus_FICT = structure(2:1, .Label = c("1,217,817",
"1,281,349"), class = "factor")), class = "data.frame", row.names = c(NA,
-2L))

Resources