Split function not maintaining structure of dataframe? - r

I am doing hierarchical clustering in R and need all the cluster's elements separately.
When I use the following code, the data is split into a list of 3 plain numeric vectors such as num [1:2628] (no column information from the original dataframe (dataA) is carried over):
clusterA <- hclust(dist(dataA),method = "single")
NumA = 3
label <- cutree(clusterA, NumA)
clusterXlist<-split(dataA,f=label)
str(clusterXlist[[1]])
How can I make sure that the split maintains the structure of dataA?
edit:
in my case
>str(clusterXlist[[1]])
num [1:2628] 0.0529 -0.3909 -0.4465 0.1 0.8393 ...
whereas for dataA
> str(dataA)
num [1:440, 1:6] 0.0529 -0.3909 -0.4465 0.1 0.8393 ...
- attr(*, "dimnames")=List of 2
..$ : NULL
..$ : chr [1:6] "Fresh" "Milk" "Grocery" "Frozen" ...
- attr(*, "scaled:center")= Named num [1:6] 12000 5796 7951 3072 2881 ...
..- attr(*, "names")= chr [1:6] "Fresh" "Milk" "Grocery" "Frozen" ...
- attr(*, "scaled:scale")= Named num [1:6] 12647 7380 9503 4855 4768 ...
..- attr(*, "names")= chr [1:6] "Fresh" "Milk" "Grocery" "Frozen" ...
edit2 :
for dataA
> dput(head(dataA,n=20))
structure(c(0.0528730042415329, -0.390857056063646, -0.44652098379972,
0.0999975794271863, 0.839284119671916, -0.204572661537808, 0.00993903725191922,
-0.349583518736614, -0.477357534676238, -0.473957607271904, -0.682697336282181,
0.0905884780058897, 1.55872457204484, 0.728746944991474, 1.00042486502152,
-0.138155475034538, -0.868191050016313, -0.484236457564077, 0.521904849881291,
-0.333690834823332, 0.522972471408079, 0.543838613660349, 0.408073194590386,
-0.623310408164662, -0.0523368792616442, 0.333686752405346, -0.351915064454946,
-0.113851350576777, -0.291078065290861, 0.717677967619194, -0.053285340273111,
-0.63306600713975, 0.883794139056095, 0.0557876760455718, 0.497093035238056,
-0.634420951441845, 0.409157150032062, 0.0488774601048851, 0.0719115132405076,
-0.447303143322465, -0.0410681453901357, 0.170124700204028, -0.0281250860936324,
-0.3925300807586, -0.0792659545334748, -0.297298628211157, -0.10273182626616,
0.15518230654465, -0.185125447641461, 1.15011422238562, 0.528531691780372,
-0.360751187201331, 0.400469064432042, 0.739829765498898, 0.435615257968889,
-0.434621330503326, 0.438772101699743, -0.528063904936618, 0.226000834240152,
0.159180975270399, -0.588697039406295, -0.269829034507317, -0.137379339965946,
0.68636300602308, 0.173661155768845, -0.495590877769126, -0.533904475256987,
-0.288985833251248, -0.545233764836731, -0.394039245717966, 0.273564891153861,
-0.340276616984998, -0.573659982327726, 0.00475174748902491,
-0.572218072744849, -0.551001403168238, -0.605176006067741, -0.459955112363749,
-0.178576756619561, -0.494972916519322, -0.0435191938188023,
0.0863085949200282, 0.13308015693741, -0.498021323377842, -0.23165413161966,
-0.227878848586867, 0.0542186891412866, 0.0921812574154842, -0.244448146341904,
0.952945788892319, 0.649245242698738, -0.489212329634658, 0.209634507324604,
0.802353943473126, 0.456496070080021, -0.40217108193415, 0.341140199633565,
-0.526755422016323, -0.0240135648160378, -0.0762383134363428,
-0.066263629344282, 0.0890496850231094, 2.24074190324533, 0.0933048443208461,
1.29786952218849, -0.0261942126239276, -0.347458739603052, 0.369181005457445,
-0.274766434933383, 0.203229792845712, 0.0777025935624781, -0.364479376793999,
0.498608767430271, -0.327246732938803, 0.228051555415843, -0.394620088486301,
-0.157749554245622, 1.04716972023017, 0.587257919466454, -0.36306099036142
), .Dim = c(20L, 6L), .Dimnames = list(NULL, c("Fresh", "Milk",
"Grocery", "Frozen", "Detergents_Paper", "Delicassen")))
for clusterXlist[[1]] which was obtained by split of dataA
> dput(head(clusterXlist[[1]],n=20))
c(0.0528730042415329, -0.390857056063646, -0.44652098379972,
0.0999975794271863, 0.839284119671916, -0.204572661537808, 0.00993903725191922,
-0.349583518736614, -0.477357534676238, -0.473957607271904, -0.682697336282181,
0.0905884780058897, 1.55872457204484, 0.728746944991474, 1.00042486502152,
-0.138155475034538, -0.868191050016313, -0.484236457564077, 0.521904849881291,
-0.333690834823332)

What you have there is a matrix, not a data frame.
class(dataA)
# [1] "matrix"
The quick and easy way to split() would be to do
split(as.data.frame(dataA), label)
However, this may cause issues in later calculations and you may need to resort to coercing those list elements back to a matrix. I would recommend you use lapply() to split the data, as follows.
clusterXlist <- lapply(
unique(label),
function(i) dataA[label == i, , drop = FALSE]
)
to properly maintain your matrix structure throughout your list elements.
str(clusterXlist[[1]])
# num [1:18, 1:6] 0.0529 -0.3909 0.1 0.8393 -0.2046 ...
# - attr(*, "dimnames")=List of 2
# ..$ : NULL
# ..$ : chr [1:6] "Fresh" "Milk" "Grocery" "Frozen" ...

Related

Getting values from list (right side of colon)

I'm new to R so I'm sure this is simple but I can't figure it out. You can see the structure of my object n below. I want to loop through n and take each non-null value from the right side of the colon (e.g. "57454470") and apply a function to it.
> str(n)
List of 1
$ :List of 10
..$ 15793766: NULL
..$ 15793767: chr "57454470"
..$ 15793769: chr "123652395"
..$ 15793770: chr "38098549"
..$ 15793771: chr "56864789"
..$ 15793776: chr "38722835"
..$ 15793779: chr "37962343"
..$ 15793784: chr "2100162920"
..$ 15793787: chr "2099439832"
..$ 15793791: chr "37992986"
..- attr(*, "dim")= int 10
..- attr(*, "dimnames")=List of 1
.. ..$ rmaddrs$ReportID: chr [1:10] "15793766" "15793767" "15793769" "15793770" ...
..- attr(*, "call")= language by.data.frame(data = rmaddrs, INDICES = rmaddrs$ReportID, FUN = getValueFromXML)
..- attr(*, "class")= chr "by"
Here is the result of dput:
dput(n[1])
list(structure(list(`15793766` = NULL, `15793767` = "57454470",
`15793769` = "123652395", `15793770` = "38098549", `15793771` = "56864789",
`15793776` = "38722835", `15793779` = "37962343", `15793784` = "2100162920",
`15793787` = "2099439832", `15793791` = "37992986"), .Dim = 10L, .Dimnames = structure(list(
`rmaddrs$ReportID` = c("15793766", "15793767", "15793769",
"15793770", "15793771", "15793776", "15793779", "15793784",
"15793787", "15793791")), .Names = "rmaddrs$ReportID"), call = by.data.frame(data = rmaddrs,
INDICES = rmaddrs$ReportID, FUN = getValueFromXML), class = "by"))
UPDATE: I removed the "print" testing and I'm trying to use mean() for a better test.
sapply(n[1], function(x) mean(x, na.rm=TRUE))
Then I had to use unlist and as.numeric and now I think I have what I need to use my custom function.
The way you are using sapply it prints everything, but then it also returns the object which (since it isn't assigned) is also printed. To avoid the printing of the returned object, you can wrap in invisible() or assign it
invisible(sapply(n[1], print))
xx = sapply(n[1], print)
(Note: this printing is just like if you enter 1 + 1 in the console, the resulting 2 will print. But if you enter x = 1 + 1 nothing prints. I also simplified your sapply by omitting the anonymous function, but that isn't related to your issue.)

Cannot write excel file using xlsx package in R

I am using dplyr to create an object that I then use xlsx to write out to a spreadsheet.
I run the following code:
provFundedProp <- compensationBase2014 %>%
group_by(provinciallyFunded) %>%
summarise(total=sum(fundingRaw)) %>%
mutate(percent = paste0(round(100 * total/sum(total),1), "%"))
Which I then write to the first sheet:
write.xlsx(provFundedProp, file="output/provFundedProp.xlsx",
sheetName="provFundingSector")
This works fine and gives me the file I need.
I then run the following code going down a level:
provFundedServiceDivision <- compensationBase2014 %>%
group_by(serviceDivision,provinciallyFunded) %>%
summarise(total=sum(fundingRaw)) %>%
mutate(percent = paste0(round(100 * total/sum(total),1), "%"))
#write to second sheet
write.xlsx(provFundedServiceDivision, file="output/provFundedSD.xlsx",
sheetName="provFundingSD")
Which gives me the following error:
Error: cannot convert object to a data frame
I am going crazy here. Does anyone have any idea what the heck is going on?
I have tried this with multiple queries and I have no idea what is up.
class(provFundedServiceDivision) [1] "grouped_df" "tbl_df" "tbl"
"data.frame"
Classes ‘grouped_df’, ‘tbl_df’, ‘tbl’ and 'data.frame': 6 obs. of 4
variables:
$ serviceDivision : chr "AS" "AS" "CLS" "CLS" ...
$ provinciallyFunded: chr "NPF" "PF" "NPF" "PF" ...
$ total : num 1.90e+06 3.97e+07 2.93e+07 5.70e+08 9.55e+07 ...
$ percent : chr "4.6%" "95.4%" "4.9%" "95.1%" ...
- attr(*, "vars")=List of 1
..$ : symbol serviceDivision
- attr(*, "labels")='data.frame': 3 obs. of 1 variable:
..$ serviceDivision: chr "AS" "CLS" "GS"
..- attr(*, "vars")=List of 1
.. ..$ : symbol serviceDivision
..- attr(*, "drop")= logi TRUE
- attr(*, "indices")=List of 3
..$ : int 0 1
..$ : int 2 3
..$ : int 4 5
- attr(*, "drop")= logi TRUE
- attr(*, "group_sizes")= int 2 2 2
- attr(*, "biggest_group_size")= int 2
> traceback()
7: stop(list(message = "cannot convert object to a data frame",
call = NULL, cppstack = NULL))
6: .Call("dplyr_cbind_all", PACKAGE = "dplyr", dots)
5: cbind_all(x)
4: bind_cols(...)
3: cbind(deparse.level, ...)
2: cbind(rownames = rownames(x), x)
1: write.xlsx(provFundedServiceDivision, file = "output/provFundedSD.xlsx",
sheetName = "provFundingSD")
eipi10 saved the day with his solution! I used the following code and everything worked fine:
write.xlsx(as.data.frame(provFundedServiceDivision),
file="output/provFundedSD.xlsx", sheetName="provFundingSD")
thanks to everyone for reading and helping me out. This is my first question on stack overflow. Cheers!
Use ungroup() at the end of your dplyr chain:
provFundedServiceDivision <- compensationBase2014 %>%
group_by(serviceDivision,provinciallyFunded) %>%
summarise(total=sum(fundingRaw)) %>%
mutate(percent = paste0(round(100 * total/sum(total),1), "%")) %>%
# Add ungroup to the end
ungroup()
#write to second sheet
write.xlsx(provFundedServiceDivision, file="output/provFundedSD.xlsx",
sheetName="provFundingSD")
Instead of...
class(provFundedServiceDivision)[1]
[1] "grouped_df"
You get...
class(provFundedServiceDivision)[1]
[1] "tbl_df"

get bounding box from ggmap object

I'm using ggmap library in R. I'm trying to download a rectangular map with it, but I know it'll give me a square. I only need the bounding box of the returned square.
library(ggmap)
map <- get_map(c(-65.7,-3.1,-64.4,-2.3),maptype="satellite",filename="map.png")
str(map)
chr [1:1280, 1:1280] "#294829" "#294829" "#2D512D" "#264425" ...
- attr(*, "class")= chr [1:2] "ggmap" "raster"
- attr(*, "bb")='data.frame': 1 obs. of 4 variables:
..$ ll.lat: num -3.14
..$ ll.lon: num -65.5
..$ ur.lat: num -2.26
..$ ur.lon: num -64.6
The object map has two classes, "ggmap" and "raster". I can't use @ or $ on it. How then can I access ll.lat and the other values from the "bb" attribute?
You can do this:
> attr(map, "bb")
ll.lat ll.lon ur.lat ur.lon
1 -3.139567 -65.48877 -2.261646 -64.60986

R dataframe define column names at creation

I get monthly price value for the two assets below from Yahoo:
if(!require("tseries") | !require(its) ) { install.packages(c("tseries", 'its')); require("tseries"); require(its) }
startDate <- as.Date("2000-01-01", format="%Y-%m-%d")
MSFT.prices = get.hist.quote(instrument="msft", start= startDate,
quote="AdjClose", provider="yahoo", origin="1970-01-01",
compression="m", retclass="its")
SP500.prices = get.hist.quote(instrument="^gspc", start=startDate,
quote="AdjClose", provider="yahoo", origin="1970-01-01",
compression="m", retclass="its")
I want to put these two into a single data frame with specified column names (Pandas allows this now - a bit ironic since they took the data.frame concept from R). As below, I assign the two time series with names:
MSFTSP500.prices <- data.frame(msft = MSFT.prices, sp500= SP500.prices )
However, this does not preserve the column names [msft, sp500] I have assigned. I need to define column names in a separate line of code:
colnames(MSFTSP500.prices) <- c("msft", "sp500")
I tried to put colnames and col.names inside the data.frame() call but it doesn't work. How can I define column names while creating the data frame?
I found ?data.frame very unhelpful...
The code fails with an error message indicating no availability of as.its. So I added the missing code (which appears to have been successful after two failed attempts.) Once you issue the missing require() call you can use str to see what sort of object get.hist.quote actually returns. It is neither a dataframe nor a zoo object, although it resembles a zoo-object in many ways:
> str(SP500.prices)
Formal class 'its' [package "its"] with 2 slots
..@ .Data: num [1:180, 1] 1394 1366 1499 1452 1421 ...
.. ..- attr(*, "dimnames")=List of 2
.. .. ..$ : chr [1:180] "2000-01-02" "2000-01-31" "2000-02-29" "2000-04-02" ...
.. .. ..$ : chr "AdjClose"
..@ dates: POSIXct[1:180], format: "2000-01-02 16:00:00" "2000-01-31 16:00:00" ...
If you run cbind on those two objects you get a regular matrix with dimnames:
> str(cbind(SP500.prices, MSFT.prices) )
num [1:180, 1:2] 1394 1366 1499 1452 1421 ...
- attr(*, "dimnames")=List of 2
..$ : chr [1:180] "2000-01-02" "2000-01-31" "2000-02-29" "2000-04-02" ...
..$ : chr [1:2] "AdjClose" "AdjClose"
You will still need to change the column names, since there does not seem to be a cbind.its that lets you assign column names. I would caution against using the data.frame method, since the resulting object might be confusing in its behavior:
> str( MSFTSP500.prices )
'data.frame': 180 obs. of 2 variables:
$ AdjClose :Formal class 'AsIs', 'its' [package ""] with 1 slot
.. ..@ .S3Class: chr "AsIs" "its"
$ AdjClose.1:Formal class 'AsIs', 'its' [package ""] with 1 slot
.. ..@ .S3Class: chr "AsIs" "its"
The columns are still S4 objects. I suppose that might be useful if you were going to pass them to other its-methods but could be confusing otherwise. This might be what you were shooting for:
> MSFTSP500.prices <- data.frame(msft = as.vector(MSFT.prices),
sp500= as.vector(SP500.prices) ,
row.names= as.character(MSFT.prices@dates) )
> str( MSFTSP500.prices )
'data.frame': 180 obs. of 2 variables:
$ msft : num 35.1 32 38.1 25 22.4 ...
$ sp500: num 1394 1366 1499 1452 1421 ...
> head(rownames(MSFTSP500.prices))
[1] "2000-01-02 16:00:00" "2000-01-31 16:00:00" "2000-02-29 16:00:00"
[4] "2000-04-02 17:00:00" "2000-04-30 17:00:00" "2000-05-31 17:00:00"
MSFT.prices is a zoo object, which seems to be a data-frame-alike, with its own column name which gets transferred to the object. Confer
tmp <- data.frame(a=1:10)
b <- data.frame(lost=tmp)
which loses the second column name.
If you do
MSFTSP500.prices <- data.frame(msft = as.vector(MSFT.prices),
sp500=as.vector(SP500.prices))
then you will get the colnames you want (though you won't get zoo-specific behaviours). Not sure why you object to renaming columns in a second command, though.

Fail to create couponbonds object in termstrc package using R

I am trying to use R package termstrc to estimate the term structure. To do that I have to prepare the data as the couponbonds class required by the package. I used some fake data to prevent the potential problem of the real data. Though I tried a lot, it still didn't work.
Any idea what is going wrong?
structure of the official demo data which works
data("govbonds")
str(govbonds)
List of 3
$ GERMANY:List of 8
..$ ISIN : chr [1:52] "DE0001141414" "DE0001137131" "DE0001141422" "DE0001137149" ...
..$ MATURITYDATE: Date[1:52], format: "2008-02-15" "2008-03-14" "2008-04-11" ...
..$ ISSUEDATE : Date[1:52], format: "2002-08-14" "2006-03-08" "2003-04-11" ...
..$ COUPONRATE : num [1:52] 0.0425 0.03 0.03 0.0325 0.0413 ...
..$ PRICE : num [1:52] 100 99.9 99.8 99.8 100.1 ...
..$ ACCRUED : num [1:52] 4.09 2.66 2.43 2.07 2.39 ...
..$ CASHFLOWS :List of 3
.. ..$ ISIN: chr [1:384] "DE0001141414" "DE0001137131" "DE0001141422" "DE0001137149" ...
.. ..$ CF : num [1:384] 104 103 103 103 104 ...
.. ..$ DATE: Date[1:384], format: "2008-02-15" "2008-03-14" "2008-04-11" ...
..$ TODAY : Date[1:1], format: "2008-01-30"
#another two are omitted here
- attr(*, "class")= chr "couponbonds"
> ns_res <- estim_nss(govbonds, c("GERMANY"), method = "ns",tauconstr=list(c(0.2, 5, 0.1)))
[1] "Searching startparameters for GERMANY"
beta0 beta1 beta2 tau1
5.008476 -1.092510 -3.209695 2.400100
my code to prepare fake data
bond=list()
bond$CHINA=list()
n=30*12#suppose I have n bond
enddate=as.Date('2014/11/7')
isin=sprintf('DE%010d',1:n)#some fake ISIN
bond$CHINA$ISIN=isin
bond$CHINA$MATURITYDATE=enddate+(1:n)*30
bond$CHINA$ISSUEDATE=rep(enddate,n)
bond$CHINA$COUPONRATE=rep(5/100,n)
bond$CHINA$PRICE=rep(100,n)
bond$CHINA$ACCRUED=rep(0,n)
bond$CHINA$CASHFLOWS=list()
bond$CHINA$CASHFLOWS$ISIN=isin
bond$CHINA$CASHFLOWS$CF=100+(1:n)*5/12
bond$CHINA$CASHFLOWS$DATE=enddate+(1:n)*30
bond$CHINA$TODAY=enddate
class(bond)='couponbonds'
ns_res <- estim_nss(bond, c("CHINA"), method = "ns",tauconstr=list(c(0.2, 5, 0.1)))
the output
Error in `colnames<-`(`*tmp*`, value = c("DE0000000001", "DE0000000002", :
attempt to set 'colnames' on an object with less than two dimensions
The problem was finally solved by adding one cashflow with amount zero to CASHFLOWS$CF.
To put it another way, at least one bond should have at least two cashflows.
You may then face another error caused by the uniroot function. Be sure to include only the cashflows after TODAY; termstrc doesn't filter the cashflows for you using TODAY.

Resources