In the following script:
dataset <- read.csv("/home/adam/Desktop/Temp/lrtest.csv")
for(i in 3:ncol(dataset)){ # visit every column from the third one onwards
uq <- unique(dataset[,i]) # distinct values found in column i
j <- i * 100 # codes for column i start at i * 100
for(x in uq){ # NOTE(review): if column i is a factor, the assignment below presumably yields NA because j is not one of its levels - confirm
dataset[,i][dataset[,i] == x] <- j #dataset$nm[dataset$nm == x] <- j
j <- j + 1 # the next distinct value gets the next code
}
}
I would like to go through each column and replace each of its string values with a number. The problem is that replacing the values (line 6) results in NA; see the output below.
How can I solve it?
The data:
Class Branch LA_type Method_type Method_call Branch_type Branch_condition Tested_parameter
Goal 12 Smooth public static never called IFNE TRUE String
TreeApp 20 Rugged constructor none IF_ICMPGE FALSE int
Password 4 Smooth private never called IFEQ FALSE int
XMLParser 9 Rugged constructor none IFNONNULL TRUE String
MapClass 33 Smooth public never called IFGT FALSE double
The output:
Class Branch LA_type Method_type Method_call Branch_type Branch_condition Tested_parameter
1 Goal 12 <NA> <NA> <NA> <NA> 700 <NA>
2 TreeApp 20 <NA> <NA> <NA> <NA> 701 <NA>
3 Password 4 <NA> <NA> <NA> <NA> 701 <NA>
4 XMLParser 9 <NA> <NA> <NA> <NA> 700 <NA>
5 MapClass 33 <NA> <NA> <NA> <NA> 701 <NA>
We can use lapply to iterate over column 3 to the end of the data frame, convert each column to a factor (which it probably already is) whose levels are its unique values in order of appearance, and add the resulting integer codes (starting from 0) to the column index times 100.
# Recode columns 3..ncol(df): within each column, distinct values are numbered
# in order of first appearance, starting at <column index> * 100.
df[3:ncol(df)] <- lapply(3:ncol(df), function(col_idx) {
  values <- df[[col_idx]]
  # Levels follow first appearance, so the integer codes do too.
  by_first_seen <- factor(values, levels = unique(values))
  col_idx * 100 + as.integer(by_first_seen) - 1
})
df
# Class Branch LA_type Method_type Method_call Branch_type Branch_condition
#1 Goal 12 300 400 500 600 700
#2 TreeApp 20 301 401 501 601 701
#3 Password 4 300 402 500 602 701
#4 XMLParser 9 301 401 501 603 700
#5 MapClass 33 300 403 500 604 701
# Tested_parameter
#1 800
#2 801
#3 801
#4 800
#5 802
data
df <- structure(list(Class = structure(c(1L, 4L, 3L, 5L, 2L), .Label = c("Goal",
"MapClass", "Password", "TreeApp", "XMLParser"), class = "factor"),
Branch = c(12L, 20L, 4L, 9L, 33L), LA_type = structure(c(2L,
1L, 2L, 1L, 2L), .Label = c("Rugged", "Smooth"), class = "factor"),
Method_type = structure(c(4L, 1L, 2L, 1L, 3L), .Label = c("constructor",
"private", "public", "public_static"), class = "factor"),
Method_call = structure(c(1L, 2L, 1L, 2L, 1L), .Label = c("never_called",
"none"), class = "factor"), Branch_type = structure(c(4L,
1L, 2L, 5L, 3L), .Label = c("IF_ICMPGE", "IFEQ", "IFGT",
"IFNE", "IFNONNULL"), class = "factor"), Branch_condition = c(TRUE,
FALSE, FALSE, TRUE, FALSE), Tested_parameter = structure(c(3L,
2L, 2L, 3L, 1L), .Label = c("double", "int", "String"), class = "factor")),
class = "data.frame", row.names = c(NA, -5L))
Related
I am new to R. I have two data frames (shown below) and I would like to add the information from df2 to df1. The only column the two data frames have in common is "Sample".
So I tried to use this column to merge both data frames.
df1
structure(list(Segment = c(3L, 3L, 3L, 4L, 5L, 6L, 6L, 6L, 7L,
7L), Position = c(838L, 891L, 1204L, 732L, 1550L, 688L, 1167L,
1446L, 950L, 981L), `AA-REF` = structure(c(2L, 5L, 7L, 6L, 1L,
8L, 8L, 1L, 3L, 4L), .Label = c("", "D", "E", "H", "K", "L",
"Q", "T"), class = "factor"), `AA-ALT` = structure(c(4L, 2L,
2L, 3L, NA, 5L, 3L, NA, 1L, 4L), .Label = c("E", "K", "M", "N",
"T"), class = "factor"), SYN = structure(c(2L, 3L, 2L, 2L, 1L,
3L, 2L, 1L, 3L, 2L), .Label = c(" ", "N ", "Y "), class = "factor"),
Sample = c("AO103", "AO103", "AO103", "AO103", "AO103", "AO103",
"AO103", "AO103", "AO103", "AO103")), row.names = c(NA, 10L
), class = "data.frame")
Segment Position AA-REF AA-ALT SYN Sample
1 3 838 D N N AO103
2 3 891 K K Y AO103
3 3 1204 Q K N AO103
4 4 732 L M N AO103
5 5 1550 <NA> AO103
6 6 688 T T Y AO103
7 6 1167 T M N AO103
8 6 1446 <NA> AO103
9 7 950 E E Y AO103
10 7 981 H N N AO103
11 8 199 T N N AO103
12 1 341 T K N AO104
13 1 934 T A N AO104
14 1 1327 L F N AO104
15 1 1349 D G N AO104
df2
structure(list(Sample = c("AO208 ", "AO209 ", "AO210 ", "AO211 ",
"AO212 ", "AO213 ", "AO100 ", "AO101 ", "AO102 ", "AO103 "),
Quail = c(7, 8, 9, 10, 11, 12, 7, 8, 9, 10), day = c(3, 3,
3, 3, 3, 3, 5, 5, 5, 5), Expo = structure(c(1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L), .Label = " DC ", class = "factor"),
Group = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L
), .Label = " var", class = "factor")), row.names = c(NA,
10L), class = "data.frame")
Sample Quail day Expo Group
1 AO208 7 3 DC var
2 AO209 8 3 DC var
3 AO210 9 3 DC var
4 AO211 10 3 DC var
5 AO212 11 3 DC var
6 AO213 12 3 DC var
7 AO100 7 5 DC var
8 AO101 8 5 DC var
9 AO102 9 5 DC var
10 AO103 10 5 DC var
11 AO104 11 5 DC var
NOTE: Not all entries in df2$Sample are present in df1$Sample
I would like to get something like the following:
Segment Position AA-REF AA-ALT SYN Sample Quail day Expo Group
1 3 838 D N N AO103 10 5 DC var
2 3 891 K K Y AO103 10 5 DC var
3 3 1204 Q K N AO103 10 5 DC var
4 4 732 L M N AO103 10 5 DC var
5 5 1550 <NA> AO103 10 5 DC var
6 6 688 T T Y AO103 10 5 DC var
7 6 1167 T M N AO103 10 5 DC var
8 6 1446 <NA> AO103 10 5 DC var
9 7 950 E E Y AO103 10 5 DC var
10 7 981 H N N AO103 10 5 DC var
11 8 199 T N N AO103 10 5 DC var
12 1 341 T K N AO104 11 5 DC var
13 1 934 T A N AO104 11 5 DC var
14 1 1327 L F N AO104 11 5 DC var
15 1 1349 D G N AO104 11 5 DC var
I tried:
x <- merge(df1, df2, by = "Sample", all = TRUE)
Even though this is adding the columns, everything from df2 is placed at the end of the df1.
I also tried using dplyr's left_join (among others) as:
x <- df1 %>%
left_join(df2, by = "Sample")
This adds empty columns from df2 and no information at all.
I have been looking at many merging posts but none of those seem to address my problem.
I also tried match without success.
# The Sample keys in df2 carry trailing whitespace (e.g. "AO103 ") while those
# in df1 do not, so an untrimmed join matches nothing and every df2 column
# comes back NA. Trim the key on a copy of df2 before merging.
# all.x = TRUE keeps every row of df1 even when it has no match in df2.
x <- merge(
  x = df1,
  y = transform(df2, Sample = trimws(Sample)),
  by = "Sample",
  all.x = TRUE
)
You only want to keep all of the rows from df1, so you only need all.x.
Shout out to Tanner33 if you want to use dplyr or tidyverse packages.
I have this sample:
> a
Ship duration.minutes event Location
1 a NA enter Skagen
2 a 1616 trip <NA>
3 a 4308 stop Copenhagen
4 b 1646 trip <NA>
5 b 5751 stop Gdynia
6 b 75 trip <NA>
7 b 45666 stop Gdansk
8 c 2531 trip <NA>
9 c 5360 stop Szczecin
10 d 287 trip <NA>
I would like to add a new column called "destination", and to add the name of the destination in these cells.
The output would be:
> output
Ship duration.minutes event Location Destination
1 a NA enter Skagen NA
2 a 1616 trip <NA> Copenhagen
3 a 4308 stop Copenhagen <NA>
4 b 1646 trip <NA> Gdynia
5 b 5751 stop Gdynia <NA>
6 b 75 trip <NA> Gdansk
7 b 45666 stop Gdansk <NA>
8 c 2531 trip <NA> Szczecin
9 c 5360 stop Szczecin <NA>
10 d 287 trip <NA> <NA>
It means that it is working per Ship: it would give the destination for the ship a only. It is taking the next Location after a trip for this very ship.
I tried with moves <- setDT(a)[, .(from = Location[-.N], to = Location[-1L]) , Ship] but it does not keep the column duration.minutes:
> dput(moves)
structure(list(Ship = c("a", "a", "b", "b", "b", "c"), from = structure(c(4L,
NA, NA, 3L, NA, NA), .Label = c("Copenhagen", "Gdansk", "Gdynia",
"Skagen", "Szczecin"), class = "factor"), to = structure(c(NA,
1L, 3L, NA, 2L, 5L), .Label = c("Copenhagen", "Gdansk", "Gdynia",
"Skagen", "Szczecin"), class = "factor")), row.names = c(NA,
-6L), class = c("data.table", "data.frame"), .Names = c("Ship",
"from", "to"), .internal.selfref = <pointer: 0x00000000003e0788>)
It looks like this:
> moves
Ship from to
1: a Skagen <NA>
2: a <NA> Copenhagen
3: b <NA> Gdynia
4: b Gdynia <NA>
5: b <NA> Gdansk
6: c <NA> Szczecin
The sample of the data called a is:
> dput(data)
structure(list(Ship = c("a", "a", "a", "b", "b", "b", "b", "c",
"c", "d"), duration.minutes = c(NA, 1616L, 4308L, 1646L, 5751L,
75L, 45666L, 2531L, 5360L, 287L), event = structure(c(1L, 3L,
2L, 3L, 2L, 3L, 2L, 3L, 2L, 3L), .Label = c("enter", "stop",
"trip"), class = "factor"), Location = structure(c(4L, NA, 1L,
NA, 3L, NA, 2L, NA, 5L, NA), .Label = c("Copenhagen", "Gdansk",
"Gdynia", "Skagen", "Szczecin"), class = "factor")), .Names = c("Ship",
"duration.minutes", "event", "Location"), row.names = c(NA, -10L
), class = c("data.table", "data.frame"))
I am afraid it is hard to work with setDT. Is there a way to keep the column duration.minutes?
I'm not sure if this covers all of your use cases, but you could use the lead function to capture the next value for each Ship. It also seems like it makes more sense to have all of the values in a single column, rather than separate Location and Destination columns.
library(tidyverse)
# Per ship, fill each missing Location with the Location of the next row.
a %>%
  group_by(Ship) %>%
  mutate(
    # Next Location within the same Ship (NA on the ship's last row).
    Destination = lead(Location),
    # Keep the existing Location; fall back to the next one when it is NA.
    Location = coalesce(Location, Destination)
  ) %>%
  # Drop the grouping so later steps are not silently computed per Ship.
  ungroup() %>%
  select(-Destination)
Ship duration.minutes event Location
<chr> <int> <fct> <fct>
1 a NA enter Skagen
2 a 1616 trip Copenhagen
3 a 4308 stop Copenhagen
4 b 1646 trip Gdynia
5 b 5751 stop Gdynia
6 b 75 trip Gdansk
7 b 45666 stop Gdansk
8 c 2531 trip Szczecin
9 c 5360 stop Szczecin
10 d 287 trip <NA>
If you want to keep separate columns, then you can shorten the code to this:
# Add Destination = the next Location recorded for the same Ship.
a %>%
  group_by(Ship) %>%
  mutate(Destination = lead(Location)) %>%
  # Drop the grouping so later steps are not silently computed per Ship.
  ungroup()
For the data sample you provided, fill can also create a single column in one step:
# Within each Ship, fill missing Location values upwards from the next
# non-missing value.
a %>%
  group_by(Ship) %>%
  fill(Location, .direction = "up") %>%
  # Drop the grouping so later steps are not silently computed per Ship.
  ungroup()
1. Simply, like using Ctrl+X on selected columns and Ctrl+V to paste them
into certain columns of another table freely.
2. Select columns of the bigger table and join them to the smaller table, which
full_join and its by argument cannot do (the tables also have different column names).
#A table (bigger table)
Manufactor Models date Serial
1 audi r55 21341 34j
2 bmw e44 13214 F34
3 cadillc fr4c 23124 00deaa
4 benz c45z 21415 3rf
5 lexus l56fs 97014 3r
6 toyota de22 75199 2ghre
#B table (smaller table)
Markers Price Types
1 Asaudi 4011 ar55
2 abmw 2334 ae44
3 acadillc 1445 fsr4c
4 fbenz 1455 cdf45z
5 falexus 5551l5 ff6fs
6 12toyota 51242 de22
Expected picture
#B table
Markers Price Types
1 Asaudi 4011 ar55
2 abmw 2334 ae44
3 acadillc 1445 fsr4c
4 fbenz 1455 cdf45z
5 falexus 5551l5 ff6fs
6 12toyota 51242 de22
7 audi NA r55
8 bmw NA e44
9 cadillc NA fr4c
10 benz NA c45z
11 lexus NA l56fs
12 toyota NA de22
Eliminating the unnecessary columns of table A first, to fit the full_join by = c("x col name" = "y col name") limitation, is one way, but it is inefficient. Is there
a cleaner and more efficient way to do that?
Your illustration suggests that you can achieve the expected result using the code snippet below.
library(dplyr)
# Keep only the two columns of A that have a counterpart in B, renaming them
# by name (rather than by position) to B's column names, then append the
# result below B. Price has no counterpart in A, so it is NA for those rows.
A %>%
  select(Markers = Manufactor, Types = Models) %>%
  bind_rows(B, .)
Output is:
Markers Price Types
1 Asaudi 4011 ar55
2 abmw 2334 ae44
3 acadillc 1445 fsr4c
4 fbenz 1455 cdf45z
5 falexus 5551l5 ff6fs
6 12toyota 51242 de22
7 audi <NA> r55
8 bmw <NA> e44
9 cadillc <NA> fr4c
10 benz <NA> c45z
11 lexus <NA> l56fs
12 toyota <NA> de22
Sample data:
> dput(A)
structure(list(Manufactor = structure(c(1L, 3L, 4L, 2L, 5L, 6L
), .Label = c("audi", "benz", "bmw", "cadillc", "lexus", "toyota"
), class = "factor"), Models = structure(c(6L, 3L, 4L, 1L, 5L,
2L), .Label = c("c45z", "de22", "e44", "fr4c", "l56fs", "r55"
), class = "factor"), date = c(21341L, 13214L, 23124L, 21415L,
97014L, 75199L), Serial = structure(c(3L, 6L, 1L, 5L, 4L, 2L), .Label = c("00deaa",
"2ghre", "34j", "3r", "3rf", "F34"), class = "factor")), .Names = c("Manufactor",
"Models", "date", "Serial"), class = "data.frame", row.names = c("1",
"2", "3", "4", "5", "6"))
> dput(B)
structure(list(Markers = structure(c(4L, 2L, 3L, 6L, 5L, 1L), .Label = c("12toyota",
"abmw", "acadillc", "Asaudi", "falexus", "fbenz"), class = "factor"),
Price = structure(c(4L, 3L, 1L, 2L, 6L, 5L), .Label = c("1445",
"1455", "2334", "4011", "51242", "5551l5"), class = "factor"),
Types = structure(c(2L, 1L, 6L, 3L, 5L, 4L), .Label = c("ae44",
"ar55", "cdf45z", "de22", "ff6fs", "fsr4c"), class = "factor")), .Names = c("Markers",
"Price", "Types"), class = "data.frame", row.names = c("1", "2",
"3", "4", "5", "6"))
I'm trying to obtain the mean "ctrlmeans" of the telephone handle time "Handle" of a single group "Actrl" based on another variable "Period". I then want to create a new variable "Difference" by subtracting that mean from the "Handle" of each person in the dataframe.
Here's what I did:
> ttp1<-read.csv("ttp1.csv")
> dput(head(ttp1,12))
structure(list(NUID = structure(c(4L, 6L, 7L, 8L, 11L, 12L, 9L,
10L, 1L, 2L, 3L, 5L), .Label = c("A000904", "A024324", "A047744",
"A063828", "A071164", "C833344", "C833345", "C833346", "E254607",
"Y950092", "Z952754", "Z993876"), class = "factor"), Period = c(201415L,
201415L, 201415L, 201415L, 201415L, 201415L, 201416L, 201416L,
201416L, 201416L, 201416L, 201416L), Queue = c(1L, 2L, 1L, 1L,
2L, 2L, 1L, 2L, 1L, 1L, 2L, 2L), Group = structure(c(2L, 4L,
3L, 3L, 3L, 3L, 1L, 4L, 3L, 3L, 3L, 3L), .Label = c("A", "A ",
"ACTRL", "B"), class = "factor"), Handle = c(1013L, 699L, 425L,
450L, 444L, 681L, 532L, 716L, 388L, 307L, 430L, 380L)), .Names = c("NUID",
"Period", "Queue", "Group", "Handle"), row.names = c(NA, 12L), class = "data.frame")
My commands:
> ctrlmeans <- with(subset(ttp1, Group=="ACTRL"), tapply(Handle, Period, mean))
> ctrlmeans
201415 201416
500.00 376.25
> Difference <- ttp1$Handle-ctrlmeans[ttp1$Period]
> Difference
<NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
NA NA NA NA NA NA NA NA NA NA NA NA
Why would I get NA?
If I included an additional grouping variable to the tapply command "queue" how would I do this?
To give you an example of how this would work with the dplyr package if you want to calculate the means of Handle by groups of Period AND Queue:
library(dplyr)

# Mean Handle of the ACTRL rows for every Period/Queue combination.
# `%>%` replaces the `%.%` operator, which current dplyr no longer provides.
ctrlmeans <-                                 # data.frame to store your results
  ttp1 %>%                                   # data.frame to use for analysis
  group_by(Period, Queue) %>%                # grouping variables (you can add/remove Queue if you like)
  filter(Group == "ACTRL") %>%               # use only rows where Group == "ACTRL"
  summarize(mean.Handle = mean(Handle)) %>%  # makes a summary column with means of Handle by group
  ungroup()                                  # drop any grouping left over after summarize

# Join the ctrlmeans to the ttp1 data frame, then add a column for the
# differences between each row's Handle and its group's control mean.
ttp1 <- inner_join(ttp1, ctrlmeans, by = c("Period", "Queue"))
ttp1["Diff"] <- with(ttp1, Handle - mean.Handle)
#>ttp1
# NUID Period Queue Group Handle mean.Handle Diff
#1 A063828 201415 1 A 1013 437.5 575.5
#2 C833345 201415 1 ACTRL 425 437.5 -12.5
#3 C833346 201415 1 ACTRL 450 437.5 12.5
#4 C833344 201415 2 B 699 562.5 136.5
#5 Z952754 201415 2 ACTRL 444 562.5 -118.5
#6 Z993876 201415 2 ACTRL 681 562.5 118.5
#7 E254607 201416 1 A 532 347.5 184.5
#8 A000904 201416 1 ACTRL 388 347.5 40.5
#9 A024324 201416 1 ACTRL 307 347.5 -40.5
#10 Y950092 201416 2 B 716 405.0 311.0
#11 A047744 201416 2 ACTRL 430 405.0 25.0
#12 A071164 201416 2 ACTRL 380 405.0 -25.0
If you want to calculate only by groups of Period, just remove Queue from the group_by statement and from the inner_join statement.
This method only works if Period is a character or a factor. Right now it's numeric, so you can change
# Look up each row's control mean by the *name* of its Period: converting the
# numeric Period to character makes `[` match against names(ctrlmeans)
# instead of treating the value as a position.
Difference <- ttp1$Handle-ctrlmeans[as.character(ttp1$Period)]
Also this method only works with one grouping variable. With more than one, you'd probably want to perform some aggregation into a new dataset to get the group summaries, and then merge that back into the larger data.frame and do whatever transformation you need. Or you can look at more advanced data.frame manipulations packages such as plyr. But that is a different question/problem really.
I have a dataframe in long form for which I need to aggregate several observations taken on a particular day.
Example data:
long <- structure(list(Day = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L,
2L, 2L, 2L, 2L), .Label = c("1", "2"), class = "factor"),
Genotype = structure(c(1L, 1L, 1L, 2L, 2L, 2L, 1L, 1L, 1L,
2L, 2L, 2L), .Label = c("A", "B"), class = "factor"), View = structure(c(1L,
2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L), .Label = c("1",
"2", "3"), class = "factor"), variable = c(1496L, 1704L,
1738L, 1553L, 1834L, 1421L, 1208L, 1845L, 1325L, 1264L, 1920L,
1735L)), .Names = c("Day", "Genotype", "View", "variable"), row.names = c(NA, -12L),
class = "data.frame")
> long
Day Genotype View variable
1 1 A 1 1496
2 1 A 2 1704
3 1 A 3 1738
4 1 B 1 1553
5 1 B 2 1834
6 1 B 3 1421
7 2 A 1 1208
8 2 A 2 1845
9 2 A 3 1325
10 2 B 1 1264
11 2 B 2 1920
12 2 B 3 1735
I need to aggregate each genotype for each day by taking the cube root of the product of each view. So for genotype A on day 1, (1496 * 1704 * 1738)^(1/3). Final dataframe would look like:
Day Genotype summary
1 1 A 1642.418
2 1 B 1593.633
3 2 A 1434.695
4 2 B 1614.790
Have been going round and round with reshape2 for the last couple of days, but not getting anywhere. Help appreciated!
I'd probably use plyr and ddply for this task:
library(plyr)
# For every Day/Genotype pair, take the cube root of the product of `variable`.
ddply(
  long,
  c("Day", "Genotype"),
  summarize,
  summary = prod(variable)^(1 / 3)
)
#-----
Day Genotype summary
1 1 A 1642.418
2 1 B 1593.633
3 2 A 1434.695
4 2 B 1614.790
Or this with dcast:
# Cast to one row per Day/Genotype pair, aggregating `variable` with the cube
# root of its product.
dcast(
  data = long,
  formula = Day + Genotype ~ .,
  fun.aggregate = function(vals) prod(vals)^(1 / 3),
  value.var = "variable"
)
#-----
Day Genotype NA
1 1 A 1642.418
2 1 B 1593.633
3 2 A 1434.695
4 2 B 1614.790
Another solution without additional packages.
# Base-R version: within each Day/Genotype group, take the n-th root of the
# product of the group's n values of `variable`.
aggregate(
  list(Summary = long$variable),
  by = list(Day = long$Day, Genotype = long$Genotype),
  FUN = function(vals) prod(vals)^(1 / length(vals))
)
Day Genotype Summary
1 1 A 1642.418
2 2 A 1434.695
3 1 B 1593.633
4 2 B 1614.790