PipelinedRDD can't be converted to a DataFrame using toDF

I have a PySpark DataFrame containing rows of comma-separated data. I want to split each row, apply the LabeledPoint constructor to it, and then convert the result to a DataFrame.
Here is my code:
import os.path
from pyspark.mllib.regression import LabeledPoint
import numpy as np
file_name = os.path.join('databricks-datasets', 'cs190', 'data-001', 'millionsong.txt')
raw_data_df = sqlContext.read.load(file_name, 'text')
rdd = raw_data_df.rdd.map(lambda line: line.split(',')).map(lambda seq:LabeledPoints(seq[0],seq[1:])).toDF()
It gives the following error message after applying .toDF():
---------------------------------------------------------------------------
Py4JJavaError Traceback (most recent call last)
<ipython-input-65-dc4d86a8ee45> in <module>()
----> 1 rdd = raw_data_df.rdd.map(lambda line: line.split(',')).map(lambda seq:LabeledPoints(seq[0],seq[1:])).toDF()
2 print(type(rdd))
3 #print(rdd.take(5))
/databricks/spark/python/pyspark/sql/context.py in toDF(self, schema, sampleRatio)
62 [Row(name=u'Alice', age=1)]
63 """
---> 64 return sqlContext.createDataFrame(self, schema, sampleRatio)
65
66 RDD.toDF = toDF
/databricks/spark/python/pyspark/sql/context.py in createDataFrame(self, data, schema, samplingRatio)
421
422 if isinstance(data, RDD):
--> 423 rdd, schema = self._createFromRDD(data, schema, samplingRatio)
424 else:
425 rdd, schema = self._createFromLocal(data, schema)
/databricks/spark/python/pyspark/sql/context.py in _createFromRDD(self, rdd, schema, samplingRatio)
org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 38.0 failed 1 times, most recent failure: Lost task 0.0 in stage 38.0 (TID 44, localhost): org.apache.spark.api.python.PythonException: Traceback (most recent call last):

Answer found:
rdd = raw_data_df.map(lambda row: row['value'].split(',')).map(lambda seq:LabeledPoint(float(seq[0]),seq[1:])).toDF()
Here I need to reference the text of each line explicitly as row['value'], even though the row contains only a single column.
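Spelled out, there were two bugs: the class is LabeledPoint (no trailing s), and its label argument must be numeric, hence the float() cast. A minimal sketch of the corrected pipeline, reusing the notebook's sqlContext and file path from above:
import os.path
from pyspark.mllib.regression import LabeledPoint

file_name = os.path.join('databricks-datasets', 'cs190', 'data-001', 'millionsong.txt')
raw_data_df = sqlContext.read.load(file_name, 'text')

# Each Row from a text source has a single string column named 'value'.
labeled_df = (raw_data_df.rdd
              .map(lambda row: row['value'].split(','))
              .map(lambda seq: LabeledPoint(float(seq[0]), seq[1:]))
              .toDF())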

Related

While trying to convert a string to a date: "An error occurred while calling o140.showString; could not parse at index 0"

I have a date column in the format 1/1/15 (month/day/year), without leading zeros and with 15 instead of 2015.
I tried:
data = data.withColumn('date' , to_date(unix_timestamp(data['date'], 'MM-dd-yyyy').cast("timestamp")))
data.orderBy('date').show()
It gives NULL in the date column (maybe because of the year format; I tried MM-dd-yy and M-d-yy too).
So I tried:
data = data.withColumn('date' , regexp_replace('date', '15', '2015'))
data = data.withColumn('date' , regexp_replace('date', '/2015/', '-15-'))
data = data.withColumn('date' , regexp_replace('date' , '/' , '-'))
Now I have the date as 1-1-2015, but when I try the code from above, it shows the following error:
---------------------------------------------------------------------------
Py4JJavaError Traceback (most recent call last)
<ipython-input-39-470368900f3b> in <module>
----> 1 data.orderBy('date').show()
C:\Users\Admin\Anaconda3\lib\site-packages\pyspark\sql\dataframe.py in show(self, n, truncate, vertical)
438 """
439 if isinstance(truncate, bool) and truncate:
--> 440 print(self._jdf.showString(n, 20, vertical))
441 else:
442 print(self._jdf.showString(n, int(truncate), vertical))
C:\Users\Admin\Anaconda3\lib\site-packages\py4j\java_gateway.py in __call__(self, *args)
1303 answer = self.gateway_client.send_command(command)
1304 return_value = get_return_value(
-> 1305 answer, self.gateway_client, self.target_id, self.name)
1306
1307 for temp_arg in temp_args:
C:\Users\Admin\Anaconda3\lib\site-packages\pyspark\sql\utils.py in deco(*a, **kw)
126 def deco(*a, **kw):
127 try:
--> 128 return f(*a, **kw)
129 except py4j.protocol.Py4JJavaError as e:
130 converted = convert_exception(e.java_exception)
C:\Users\Admin\Anaconda3\lib\site-packages\py4j\protocol.py in get_return_value(answer, gateway_client, target_id, name)
326 raise Py4JJavaError(
327 "An error occurred while calling {0}{1}{2}.\n".
--> 328 format(target_id, ".", name), value)
329 else:
330 raise Py4JError(
Py4JJavaError: An error occurred while calling o140.showString.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 1 in stage 34.0 failed 1 times, most recent failure: Lost task 1.0 in stage 34.0 (TID 54, DESKTOP-IQ36PJF, executor driver): org.apache.spark.SparkUpgradeException: You may get a different result due to the upgrading of Spark 3.0: Fail to parse '5-17-2015' in the new parser. You can set spark.sql.legacy.timeParserPolicy to LEGACY to restore the behavior before Spark 3.0, or set to CORRECTED and treat it as an invalid datetime string.
at org.apache.spark.sql.catalyst.util.DateTimeFormatterHelper$$anonfun$checkParsedDiff$1.applyOrElse(DateTimeFormatterHelper.scala:150)
at org.apache.spark.sql.catalyst.util.DateTimeFormatterHelper$$anonfun$checkParsedDiff$1.applyOrElse(DateTimeFormatterHelper.scala:141)
at scala.runtime.AbstractPartialFunction.apply(AbstractPartialFunction.scala:38)
at org.apache.spark.sql.catalyst.util.Iso8601TimestampFormatter.$anonfun$parse$1(TimestampFormatter.scala:86)
at scala.runtime.java8.JFunction0$mcJ$sp.apply(JFunction0$mcJ$sp.java:23)
at scala.Option.getOrElse(Option.scala:189)
at org.apache.spark.sql.catalyst.util.Iso8601TimestampFormatter.parse(TimestampFormatter.scala:77)
at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
at org.apache.spark.sql.execution.WholeStageCodegenExec$$anon$1.hasNext(WholeStageCodegenExec.scala:729)
at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:458)
at scala.collection.convert.Wrappers$IteratorWrapper.hasNext(Wrappers.scala:31)
at org.sparkproject.guava.collect.Ordering.leastOf(Ordering.java:628)
at org.apache.spark.util.collection.Utils$.takeOrdered(Utils.scala:37)
at org.apache.spark.rdd.RDD.$anonfun$takeOrdered$2(RDD.scala:1492)
at org.apache.spark.rdd.RDD.$anonfun$mapPartitions$2(RDD.scala:837)
at org.apache.spark.rdd.RDD.$anonfun$mapPartitions$2$adapted(RDD.scala:837)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:349)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:313)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
at org.apache.spark.scheduler.Task.run(Task.scala:127)
at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:446)
at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1377)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:449)
at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1135)
at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:635)
at java.base/java.lang.Thread.run(Thread.java:844)
Caused by: java.time.format.DateTimeParseException: Text '5-17-2015' could not be parsed at index 0
at java.base/java.time.format.DateTimeFormatter.parseResolved0(DateTimeFormatter.java:2046)
at java.base/java.time.format.DateTimeFormatter.parse(DateTimeFormatter.java:1874)
at org.apache.spark.sql.catalyst.util.Iso8601TimestampFormatter.$anonfun$parse$1(TimestampFormatter.scala:78)
... 24 more
Driver stacktrace:
at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2059)
at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2008)
at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2007)
at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2007)
at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:973)
at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:973)
at scala.Option.foreach(Option.scala:407)
at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:973)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2239)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2188)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2177)
at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:775)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2099)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2194)
at org.apache.spark.rdd.RDD.$anonfun$reduce$1(RDD.scala:1094)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
at org.apache.spark.rdd.RDD.withScope(RDD.scala:388)
at org.apache.spark.rdd.RDD.reduce(RDD.scala:1076)
at org.apache.spark.rdd.RDD.$anonfun$takeOrdered$1(RDD.scala:1498)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
at org.apache.spark.rdd.RDD.withScope(RDD.scala:388)
at org.apache.spark.rdd.RDD.takeOrdered(RDD.scala:1486)
at org.apache.spark.sql.execution.TakeOrderedAndProjectExec.executeCollect(limit.scala:183)
at org.apache.spark.sql.Dataset.collectFromPlan(Dataset.scala:3627)
at org.apache.spark.sql.Dataset.$anonfun$head$1(Dataset.scala:2697)
at org.apache.spark.sql.Dataset.$anonfun$withAction$1(Dataset.scala:3618)
at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$5(SQLExecution.scala:100)
at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:160)
at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:87)
at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:764)
at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:64)
at org.apache.spark.sql.Dataset.withAction(Dataset.scala:3616)
at org.apache.spark.sql.Dataset.head(Dataset.scala:2697)
at org.apache.spark.sql.Dataset.take(Dataset.scala:2904)
at org.apache.spark.sql.Dataset.getRows(Dataset.scala:300)
at org.apache.spark.sql.Dataset.showString(Dataset.scala:337)
at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.base/java.lang.reflect.Method.invoke(Method.java:564)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at py4j.Gateway.invoke(Gateway.java:282)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:238)
at java.base/java.lang.Thread.run(Thread.java:844)
Caused by: org.apache.spark.SparkUpgradeException: You may get a different result due to the upgrading of Spark 3.0: Fail to parse '5-17-2015' in the new parser. You can set spark.sql.legacy.timeParserPolicy to LEGACY to restore the behavior before Spark 3.0, or set to CORRECTED and treat it as an invalid datetime string.
at org.apache.spark.sql.catalyst.util.DateTimeFormatterHelper$$anonfun$checkParsedDiff$1.applyOrElse(DateTimeFormatterHelper.scala:150)
at org.apache.spark.sql.catalyst.util.DateTimeFormatterHelper$$anonfun$checkParsedDiff$1.applyOrElse(DateTimeFormatterHelper.scala:141)
at scala.runtime.AbstractPartialFunction.apply(AbstractPartialFunction.scala:38)
at org.apache.spark.sql.catalyst.util.Iso8601TimestampFormatter.$anonfun$parse$1(TimestampFormatter.scala:86)
at scala.runtime.java8.JFunction0$mcJ$sp.apply(JFunction0$mcJ$sp.java:23)
at scala.Option.getOrElse(Option.scala:189)
at org.apache.spark.sql.catalyst.util.Iso8601TimestampFormatter.parse(TimestampFormatter.scala:77)
at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
at org.apache.spark.sql.execution.WholeStageCodegenExec$$anon$1.hasNext(WholeStageCodegenExec.scala:729)
at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:458)
at scala.collection.convert.Wrappers$IteratorWrapper.hasNext(Wrappers.scala:31)
at org.sparkproject.guava.collect.Ordering.leastOf(Ordering.java:628)
at org.apache.spark.util.collection.Utils$.takeOrdered(Utils.scala:37)
at org.apache.spark.rdd.RDD.$anonfun$takeOrdered$2(RDD.scala:1492)
at org.apache.spark.rdd.RDD.$anonfun$mapPartitions$2(RDD.scala:837)
at org.apache.spark.rdd.RDD.$anonfun$mapPartitions$2$adapted(RDD.scala:837)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:349)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:313)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
at org.apache.spark.scheduler.Task.run(Task.scala:127)
at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:446)
at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1377)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:449)
at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1135)
at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:635)
... 1 more
Caused by: java.time.format.DateTimeParseException: Text '5-17-2015' could not be parsed at index 0
at java.base/java.time.format.DateTimeFormatter.parseResolved0(DateTimeFormatter.java:2046)
at java.base/java.time.format.DateTimeFormatter.parse(DateTimeFormatter.java:1874)
at org.apache.spark.sql.catalyst.util.Iso8601TimestampFormatter.$anonfun$parse$1(TimestampFormatter.scala:78)
... 24 more
Any help regarding this would be appreciated.
Thanks!
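For what it's worth, the pattern has to match the raw strings exactly: the original values use slashes and single digits, so 'M/d/yy' fits 1/1/15 directly and none of the regexp_replace rewriting is needed. A minimal sketch, with made-up sample rows:
from pyspark.sql import SparkSession
from pyspark.sql.functions import to_date

spark = SparkSession.builder.getOrCreate()

# Hypothetical sample in the question's original month/day/year format.
data = spark.createDataFrame([('1/1/15',), ('5/17/15',)], ['date'])

# 'M' and 'd' accept one or two digits; 'yy' handles the two-digit year,
# so no cast through unix_timestamp is required.
data = data.withColumn('date', to_date(data['date'], 'M/d/yy'))
data.orderBy('date').show()
Alternatively, the exception's own suggestion, spark.conf.set('spark.sql.legacy.timeParserPolicy', 'LEGACY'), restores the pre-3.0 parser behavior.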

PySpark map datetime to DoW

I'm trying to map a column 'eventtimestamp' to its day of week with the following function:
from datetime import datetime
import calendar
from pyspark.sql.functions import UserDefinedFunction as udf
def toWeekDay(x):
    v = int(datetime.strptime(str(x), '%Y-%m-%d %H:%M:%S').strftime('%w'))
    if v == 0:
        v = 6
    else:
        v = v - 1
    return calendar.day_name[v]
and for my df I'm trying to create a new column dow with a UDF:
udf_toWeekDay = udf(lambda x: toWeekDay(x), StringType())
df = df.withColumn("dow",udf_toWeekDay('eventtimestamp'))
Yet I'm getting an error I do not understand at all. At first it complained about passing a datetime.datetime into strptime instead of a string, so I cast to str, and now I don't have a clue what's wrong.
Traceback (most recent call last):
File "/tmp/zeppelin_pyspark-9040214714346906648.py", line 267, in <module>
raise Exception(traceback.format_exc())
Exception: Traceback (most recent call last):
File "/tmp/zeppelin_pyspark-9040214714346906648.py", line 260, in <module>
exec(code)
File "<stdin>", line 10, in <module>
File "/usr/lib/spark/python/pyspark/sql/dataframe.py", line 429, in take
return self.limit(num).collect()
File "/usr/lib/spark/python/pyspark/sql/dataframe.py", line 391, in collect
port = self._jdf.collectToPython()
File "/usr/lib/spark/python/lib/py4j-0.10.4-src.zip/py4j/java_gateway.py", line 1133, in __call__
answer, self.gateway_client, self.target_id, self.name)
File "/usr/lib/spark/python/pyspark/sql/utils.py", line 63, in deco
return f(*a, **kw)
File "/usr/lib/spark/python/lib/py4j-0.10.4-src.zip/py4j/protocol.py", line 319, in get_return_value
format(target_id, ".", name), value)
Py4JJavaError: An error occurred while calling o6250.collectToPython.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 1107.0 failed 4 times, most recent failure: Lost task 0.3 in stage 1107.0 (TID 63757, ip-172-31-27-113.eu-west-1.compute.internal, executor 819): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
Thanks a lot for clues!
We can use date_format to get the day of week:
df = df.withColumn("dow",date_format(df['eventtimestamp'],'EEEE'))
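For reference, a self-contained version of that answer (the sample timestamp is made up, and date_format needs importing from pyspark.sql.functions):
from pyspark.sql import SparkSession
from pyspark.sql.functions import date_format

spark = SparkSession.builder.getOrCreate()

# Hypothetical row matching the question's '%Y-%m-%d %H:%M:%S' format.
df = spark.createDataFrame([('2017-10-09 14:00:00',)], ['eventtimestamp'])

# 'EEEE' produces the full day name (e.g. 'Monday'). Unlike the Python UDF,
# this runs entirely inside the JVM, so it avoids serialization overhead too.
df = df.withColumn('dow', date_format(df['eventtimestamp'], 'EEEE'))
df.show()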

Restore VGG16 network in TensorFlow

This one has been giving me a headache for quite some time now, even though it seems to be very basic.
I have the VGG16 network downloaded as a .ckpt file
(from https://github.com/tensorflow/models/blob/master/slim/README.md#Pretrained)
Now what I want to do is load, for example, the weight tensor of the first convolutional layer of this network as an array in R.
I tried:
restorer = tf$train$Saver()
sess = tf$Session()
restorer$restore(sess, "/home/beheerder/R/vgg_16.ckpt")
But then I do not see any variables appearing in my environment.
I'm working in R, but an answer in Python is fine as well, as I can probably translate it to R.
Saver takes the variables to restore in its constructor. In other words, you have to create the variables before you can restore them. Here is the example from Saver's docs:
v1 = tf.Variable(..., name='v1')
v2 = tf.Variable(..., name='v2')
# Pass the variables as a dict:
saver = tf.train.Saver({'v1': v1, 'v2': v2})
# Or pass them as a list.
saver = tf.train.Saver([v1, v2])
If you were to run the first line of your code in Python, you would get:
In [1]: import tensorflow as tf
In [2]: saver = tf.train.Saver()
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-2-18da33d742f9> in <module>()
----> 1 saver = tf.train.Saver()
/usr/local/lib/python2.7/dist-packages/tensorflow/python/training/saver.pyc in __init__(self, var_list, reshape, sharded, max_to_keep, keep_checkpoint_every_n_hours, name, restore_sequentially, saver_def, builder, defer_build, allow_empty, write_version, pad_step_number)
1054 self._pad_step_number = pad_step_number
1055 if not defer_build:
-> 1056 self.build()
1057 if self.saver_def:
1058 self._check_saver_def()
/usr/local/lib/python2.7/dist-packages/tensorflow/python/training/saver.pyc in build(self)
1075 return
1076 else:
-> 1077 raise ValueError("No variables to save")
1078 self._is_empty = False
1079 self.saver_def = self._builder.build(
ValueError: No variables to save
You can see how the model variables are created before being restored in the 20 lines starting at https://github.com/tensorflow/models/blob/master/slim/train_image_classifier.py#L338
This code is executed if you call train_image_classifier.py as in the flowers example in https://github.com/tensorflow/models/blob/master/slim/README.md#fine-tuning-a-model-from-an-existing-checkpoint
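Putting that together for VGG16: a minimal Python sketch, assuming the nets package from the linked tensorflow/models slim repository is on the Python path, and using the checkpoint path from the question. The variable name follows slim's VGG16 naming scheme.
import tensorflow as tf
import tensorflow.contrib.slim as slim
from nets import vgg  # from the tensorflow/models slim repository

# Build the graph first so the variables exist, then restore into them.
images = tf.placeholder(tf.float32, [None, 224, 224, 3])
with slim.arg_scope(vgg.vgg_arg_scope()):
    logits, _ = vgg.vgg_16(images, num_classes=1000, is_training=False)

saver = tf.train.Saver()
with tf.Session() as sess:
    saver.restore(sess, '/home/beheerder/R/vgg_16.ckpt')
    # Weights of the first convolutional layer as a NumPy array.
    conv1_w = sess.run(tf.get_default_graph()
                       .get_tensor_by_name('vgg_16/conv1/conv1_1/weights:0'))
    print(conv1_w.shape)  # expected: (3, 3, 3, 64)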

Issues running rmagic in IPython - Yosemite

I'm currently having issues trying to load rmagic into IPython since I upgraded to OS X Yosemite.
I'm using the following command:
%load_ext rmagic
Below is the error I'm getting, and I'm not sure if it's because I upgraded to Yosemite or because I'm trying to load R incorrectly. I've pasted the whole traceback in case it's useful, but the error ends in
ImportError: cannot import name conversion
My R version is 3.1.1
My Python version is 2.7.6
My IPython version is 2.3.0
I'm not sure what version of rpy2 I have, but I only downloaded it a few days ago, so I assume it's the latest.
ImportError Traceback (most recent call last)
<ipython-input-7-691c6d73b073> in <module>()
----> 1 get_ipython().magic(u'load_ext rpy2.ipython')
/Library/Python/2.7/site-packages/IPython/core/interactiveshell.pyc in magic(self, arg_s)
2203 magic_name, _, magic_arg_s = arg_s.partition(' ')
2204 magic_name = magic_name.lstrip(prefilter.ESC_MAGIC)
-> 2205 return self.run_line_magic(magic_name, magic_arg_s)
2206
2207 #-------------------------------------------------------------------------
/Library/Python/2.7/site-packages/IPython/core/interactiveshell.pyc in run_line_magic(self, magic_name, line)
2124 kwargs['local_ns'] = sys._getframe(stack_depth).f_locals
2125 with self.builtin_trap:
-> 2126 result = fn(*args,**kwargs)
2127 return result
2128
/Library/Python/2.7/site-packages/IPython/core/magics/extension.pyc in load_ext(self, module_str)
/Library/Python/2.7/site-packages/IPython/core/magic.pyc in <lambda>(f, *a, **k)
191 # but it's overkill for just that one bit of state.
192 def magic_deco(arg):
--> 193 call = lambda f, *a, **k: f(*a, **k)
194
195 if callable(arg):
/Library/Python/2.7/site-packages/IPython/core/magics/extension.pyc in load_ext(self, module_str)
61 if not module_str:
62 raise UsageError('Missing module name.')
---> 63 res = self.shell.extension_manager.load_extension(module_str)
64
65 if res == 'already loaded':
/Library/Python/2.7/site-packages/IPython/core/extensions.pyc in load_extension(self, module_str)
96 if module_str not in sys.modules:
97 with prepended_to_syspath(self.ipython_extension_dir):
---> 98 __import__(module_str)
99 mod = sys.modules[module_str]
100 if self._call_load_ipython_extension(mod):
/Library/Python/2.7/site-packages/rpy2/ipython/__init__.py in <module>()
----> 1 from .rmagic import load_ipython_extension
/Library/Python/2.7/site-packages/rpy2/ipython/rmagic.py in <module>()
51
52 import rpy2.rinterface as ri
---> 53 import rpy2.robjects as ro
54 import rpy2.robjects.packages as rpacks
55
/Library/Python/2.7/site-packages/rpy2/robjects/__init__.py in <module>()
16 import rpy2.rlike.container as rlc
17
---> 18 from rpy2.robjects.robject import RObjectMixin, RObject
19 from rpy2.robjects.vectors import *
20 from rpy2.robjects.functions import Function, SignatureTranslatedFunction
/Library/Python/2.7/site-packages/rpy2/robjects/robject.py in <module>()
5 rpy2.rinterface.initr()
6
----> 7 from . import conversion
8
9 class RObjectMixin(object):
ImportError: cannot import name conversion
Minrk had it right - uninstalling and reinstalling rpy2 fixed my problem. Not sure why I didn't think of that before! Thanks.

XLDateAmbiguous error even when using dayfirst argument

I'm trying to import data from an Excel spreadsheet into a pandas DataFrame, parsing dates. I'm using dayfirst, however I still get an XLDateAmbiguous error (docs).
The dates are in a single column in the format 25/09/1990.
Could somebody explain to me why this is happening and how I can fix it? Thanks in advance.
Edit: It seems the problem is caused by xlrd attempting to parse a non-date column as a date, even though I've specified which column the dates are in. Unfortunately I don't know how to explicitly indicate that a column should not be parsed as dates. Does anybody have any ideas?
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime
import dateutil
path6 = 'C:\\Users\\Site2_Homepage_2013-06-04.xlsx'
df8 = pd.io.excel.read_excel(path6, 'Site2_Homepage_2012_06_13', header=1, parse_dates=True, dayfirst=True)
XLDateAmbiguous Traceback (most recent call last)
<ipython-input-17-4a83d104ab72> in <module>()
4 path7 = 'C:\\Users\\Site4_Homepage_2013-06-04.xlsx'
5 path8 = 'C:\\Users\\Site7_Homepage_2013-06-04.xlsx'
----> 6 df8 = pd.io.excel.read_excel(path6, 'Site2_Homepage_2012_06_13', header=1, parse_dates=True, dayfirst=True)
7 df9 = pd.io.excel.read_excel(path7, 'Site4_Homepage_2012_06_13', header=1, parse_dates=[3], dayfirst=True)
8 df10 = pd.io.excel.read_excel(path8, 'Site7_Homepage_2012_06_13', header=1, parse_dates=[3], dayfirst=True)
C:\Users\AppData\Local\Enthought\Canopy32\User\lib\site-packages\pandas\io\excel.pyc in read_excel(io, sheetname, **kwds)
101 engine = kwds.pop('engine', None)
102
--> 103 return ExcelFile(io, engine=engine).parse(sheetname=sheetname, **kwds)
104
105
C:\Users\AppData\Local\Enthought\Canopy32\User\lib\site-packages\pandas\io\excel.pyc in parse(self, sheetname, header, skiprows, skip_footer, index_col, parse_cols, parse_dates, date_parser, na_values, thousands, chunksize, convert_float, has_index_names, **kwds)
206 skip_footer=skip_footer,
207 convert_float=convert_float,
--> 208 **kwds)
209
210 def _should_parse(self, i, parse_cols):
C:\Users\AppData\Local\Enthought\Canopy32\User\lib\site-packages\pandas\io\excel.pyc in _parse_excel(self, sheetname, header, skiprows, skip_footer, index_col, has_index_names, parse_cols, parse_dates, date_parser, na_values, thousands, chunksize, convert_float, **kwds)
267 if parse_cols is None or should_parse[j]:
268 if typ == XL_CELL_DATE:
--> 269 dt = xldate_as_tuple(value, datemode)
270 # how to produce this first case?
271 if dt[0] < datetime.MINYEAR: # pragma: no cover
C:\Users\AppData\Local\Enthought\Canopy32\User\lib\site-packages\xlrd\xldate.pyc in xldate_as_tuple(xldate, datemode)
78
79 if xldays < 61 and datemode == 0:
---> 80 raise XLDateAmbiguous(xldate)
81
82 jdn = xldays + _JDN_delta[datemode]
XLDateAmbiguous: 15.3
I didn't manage to find a solution to this. In the end I had to use .csv versions of the files for the dates to parse correctly.
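Since CSV export was the eventual workaround, a short sketch of reading one of those exports with day-first parsing (the file and column names are hypothetical):
import pandas as pd

# read_csv never sees Excel's ambiguous serial-date cells, unlike the xlrd
# path; dayfirst=True keeps 25/09/1990 from being read month-first.
df = pd.read_csv('Site2_Homepage_2013-06-04.csv', header=1,
                 parse_dates=['date'], dayfirst=True)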
