I'm trying to import data into a pandas dataframe object from an excel spreadsheet parsing dates. I'm using dayfirst however I still get an error XLDateAmbiguous (docs)
The dates are in a single column in the format 25/09/1990
Could somebody explain to me why this happening and how I can fix it? Thanks in advance.
Edit: It seems as though the problem is caused by xlrd attempting to parse a non-date column as a date even thought I've specified which column the dates are in. Unfortunately I don't know how to explicitly indicate that a column should not be parsed as dates. Does anybody have any ideas?
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime
import dateutil
path6 = 'C:\\Users\\Site2_Homepage_2013-06-04.xlsx'
df8 = pd.io.excel.read_excel(path6, 'Site2_Homepage_2012_06_13', header=1, parse_dates=True, dayfirst=True)
XLDateAmbiguous Traceback (most recent call last)
<ipython-input-17-4a83d104ab72> in <module>()
4 path7 = 'C:\\Users\\Site4_Homepage_2013-06-04.xlsx'
5 path8 = 'C:\\Users\\Site7_Homepage_2013-06-04.xlsx'
----> 6 df8 = pd.io.excel.read_excel(path6, 'Site2_Homepage_2012_06_13', header=1, parse_dates=True, dayfirst=True)
7 df9 = pd.io.excel.read_excel(path7, 'Site4_Homepage_2012_06_13', header=1, parse_dates=[3], dayfirst=True)
8 df10 = pd.io.excel.read_excel(path8, 'Site7_Homepage_2012_06_13', header=1, parse_dates=[3], dayfirst=True)
C:\Users\AppData\Local\Enthought\Canopy32\User\lib\site-packages\pandas\io\excel.pyc in read_excel(io, sheetname, **kwds)
101 engine = kwds.pop('engine', None)
102
--> 103 return ExcelFile(io, engine=engine).parse(sheetname=sheetname, **kwds)
104
105
C:\Users\AppData\Local\Enthought\Canopy32\User\lib\site-packages\pandas\io\excel.pyc in parse(self, sheetname, header, skiprows, skip_footer, index_col, parse_cols, parse_dates, date_parser, na_values, thousands, chunksize, convert_float, has_index_names, **kwds)
206 skip_footer=skip_footer,
207 convert_float=convert_float,
--> 208 **kwds)
209
210 def _should_parse(self, i, parse_cols):
C:\Users\AppData\Local\Enthought\Canopy32\User\lib\site-packages\pandas\io\excel.pyc in _parse_excel(self, sheetname, header, skiprows, skip_footer, index_col, has_index_names, parse_cols, parse_dates, date_parser, na_values, thousands, chunksize, convert_float, **kwds)
267 if parse_cols is None or should_parse[j]:
268 if typ == XL_CELL_DATE:
--> 269 dt = xldate_as_tuple(value, datemode)
270 # how to produce this first case?
271 if dt[0] < datetime.MINYEAR: # pragma: no cover
C:\Users\AppData\Local\Enthought\Canopy32\User\lib\site-packages\xlrd\xldate.pyc in xldate_as_tuple(xldate, datemode)
78
79 if xldays < 61 and datemode == 0:
---> 80 raise XLDateAmbiguous(xldate)
81
82 jdn = xldays + _JDN_delta[datemode]
XLDateAmbiguous: 15.3
I didn't manage to find a solution of this. In the end I had to use .csv versions of the files for the dates to parse correctly.
Related
Sympy is downloaded with anaconda. In a Jupyter notebook, I have 'from sympy import divisors' and I get this:
ModuleNotFoundError Traceback (most recent call last)
Cell In[12], line 1
----> 1 from sympy import divisors
File ~/Desktop/anaconda3/lib/python3.9/site-packages/sympy/__init__.py:107
70 from .assumptions import (AppliedPredicate, Predicate, AssumptionsContext,
71 assuming, Q, ask, register_handler, remove_handler, refine)
73 from .polys import (Poly, PurePoly, poly_from_expr, parallel_poly_from_expr,
74 degree, total_degree, degree_list, LC, LM, LT, pdiv, prem, pquo,
75 pexquo, div, rem, quo, exquo, half_gcdex, gcdex, invert,
(...)
104 laguerre_poly, apart, apart_list, assemble_partfrac_list, Options,
105 ring, xring, vring, sring, field, xfield, vfield, sfield)
--> 107 from .series import (Order, O, limit, Limit, gruntz, series, approximants,
108 residue, EmptySequence, SeqPer, SeqFormula, sequence, SeqAdd, SeqMul,
109 fourier_series, fps, difference_delta, limit_seq)
111 from .functions import (factorial, factorial2, rf, ff, binomial,
112 RisingFactorial, FallingFactorial, subfactorial, carmichael,
113 fibonacci, lucas, motzkin, tribonacci, harmonic, bernoulli, bell, euler,
(...)
132 Znm, elliptic_k, elliptic_f, elliptic_e, elliptic_pi, beta, mathieus,
133 mathieuc, mathieusprime, mathieucprime, riemann_xi, betainc, betainc_regularized)
135 from .ntheory import (nextprime, prevprime, prime, primepi, primerange,
136 randprime, Sieve, sieve, primorial, cycle_length, composite,
137 compositepi, isprime, divisors, proper_divisors, factorint,
(...)
148 continued_fraction_iterator, continued_fraction_reduce,
149 continued_fraction_convergents, continued_fraction, egyptian_fraction)
File ~/Desktop/anaconda3/lib/python3.9/site-packages/sympy/series/__init__.py:7
5 from .gruntz import gruntz
6 from .series import series
----> 7 from .approximants import approximants
8 from .residues import residue
9 from .sequences import SeqPer, SeqFormula, sequence, SeqAdd, SeqMul
ModuleNotFoundError: No module named 'sympy.series.approximants'
Any ideas on what I am doing wrong? Thank you.
Tried using the terminal to update, but I am garbage with anything beyond the basics.
Problem Summary
I am attempting to convert a.grib2 file representing a single day's worth of gridded radar rainfall data spanning the continental US, into a netcdf. When a .grib2 is missing timesteps, I am attempting to fill them in with NA values using xarray.Dataset.reindex before running xarray.Dataset.to_netcdf. However, after I've reindexed the dataset, the script fails due to a memory allocation error. It succeeds if I don't reindex. One clue could be in the fact that the dataset chunks are set to (70, 3500, 7000), but when ds.to_netcdf is called, the script fails because it's attempting to load a chunk with dimensions (210, 3500, 7000).
Accessing Full Reproducible Example
The code and data to reproduce my results can be downloaded from this Dropbox link. The code is also shown below followed by the outputs. Potentially relevant OS and environment information are shown below as well.
Code
#%% Import libraries
import time
start_time = time.time()
import xarray as xr
import cfgrib
from glob import glob
import pandas as pd
import dask
dask.config.set(**{'array.slicing.split_large_chunks': False}) # to silence warnings of loading large slice into memory
dask.config.set(scheduler='synchronous') # this forces single threaded computations (netcdfs can only be written serially)
#%% parameters
chnk_sz = "7000MB"
fl_out_nc = "out_netcdfs/20010101.nc"
fldr_in_grib = "in_gribs/20010101.grib2"
#%% loading and exporting dataset
ds = xr.open_dataset(fldr_in_grib, engine="cfgrib", chunks={"time":chnk_sz},
backend_kwargs={'indexpath': ''})
# reindex
start_date = pd.to_datetime('2001-01-01')
tstep = pd.Timedelta('0 days 00:05:00')
new_index = pd.date_range(start=start_date, end=start_date + pd.Timedelta(1, "day"),\
freq=tstep, inclusive='left')
ds = ds.reindex(indexers={"time":new_index})
ds = ds.unify_chunks()
ds = ds.chunk(chunks={'time':chnk_sz})
print("######## INSPECTING DATASET PRIOR TO WRITING TO NETCDF ########")
print(ds)
print(' ')
print("######## ERROR MESSAGE ########")
ds.to_netcdf(fl_out_nc, encoding= {"unknown":{"zlib":True}})
Outputs
######## INSPECTING DATASET PRIOR TO WRITING TO NETCDF ########
<xarray.Dataset>
Dimensions: (time: 288, latitude: 3500, longitude: 7000)
Coordinates:
* time (time) datetime64[ns] 2001-01-01 ... 2001-01-01T23:55:00
* latitude (latitude) float64 54.99 54.98 54.98 54.97 ... 20.03 20.02 20.01
* longitude (longitude) float64 230.0 230.0 230.0 ... 300.0 300.0 300.0
step timedelta64[ns] ...
surface float64 ...
valid_time (time) datetime64[ns] dask.array<chunksize=(288,), meta=np.ndarray>
Data variables:
unknown (time, latitude, longitude) float32 dask.array<chunksize=(70, 3500, 7000), meta=np.ndarray>
Attributes:
GRIB_edition: 2
GRIB_centre: 161
GRIB_centreDescription: 161
GRIB_subCentre: 0
Conventions: CF-1.7
institution: 161
history: 2022-09-10T14:50 GRIB to CDM+CF via cfgrib-0.9.1...
######## ERROR MESSAGE ########
Output exceeds the size limit. Open the full output data in a text editor
---------------------------------------------------------------------------
MemoryError Traceback (most recent call last)
d:\Dropbox\_Sharing\reprex\2022-9-9_writing_ncdf_fails\reprex\exporting_netcdfs_reduced.py in <cell line: 22>()
160 print(' ')
161 print("######## ERROR MESSAGE ########")
---> 162 ds.to_netcdf(fl_out_nc, encoding= {"unknown":{"zlib":True}})
File c:\Users\Daniel\anaconda3\envs\weather_gen_3\lib\site-packages\xarray\core\dataset.py:1882, in Dataset.to_netcdf(self, path, mode, format, group, engine, encoding, unlimited_dims, compute, invalid_netcdf)
1879 encoding = {}
1880 from ..backends.api import to_netcdf
-> 1882 return to_netcdf( # type: ignore # mypy cannot resolve the overloads:(
1883 self,
1884 path,
1885 mode=mode,
1886 format=format,
1887 group=group,
1888 engine=engine,
1889 encoding=encoding,
1890 unlimited_dims=unlimited_dims,
1891 compute=compute,
1892 multifile=False,
1893 invalid_netcdf=invalid_netcdf,
1894 )
File c:\Users\xxxxx\anaconda3\envs\weather_gen_3\lib\site-packages\xarray\backends\api.py:1219, in to_netcdf(dataset, path_or_file, mode, format, group, engine, encoding, unlimited_dims, compute, multifile, invalid_netcdf)
...
121 return arg
File <__array_function__ internals>:180, in where(*args, **kwargs)
MemoryError: Unable to allocate 19.2 GiB for an array with shape (210, 3500, 7000) and data type float32
Environment
windows 11 Home
xarray 2022.3.0
cfgrib 0.9.10.1
dask 2022.7.0
A functional workaround is to chunk by a dimension that is unchanged during reindexing. The following modification causes the script to run successfully:
ds = xr.open_dataset(
fldr_in_grib,
engine="cfgrib",
chunks={ "latitude": 875 },
backend_kwargs={ 'indexpath': '' }
)
I'm trying to modify date column.
Code is below:
sample = sample.withColumn('next_date', when(sample.next_date.isNull(), (sample['next_date'] + timedelta(days=1))).otherwise(sample['next_date']))
Its giving me following error:
AttributeError Traceback (most recent call last)
<ipython-input-127-dd09f90d8a49> in <module>()
6 sample = sample.withColumn('next_date', lead('date').over(windowSpecs))
7
----> 8 sample = sample.withColumn('next_date', when(sample.next_date.isNull(), (sample['next_date'] + timedelta(days=1))).otherwise(sample['next_date']))
9
10 sample = sample.withColumn('snapshot_date', lit(dt.datetime.now().strftime("%d-%m-%Y %H:%M")))
/usr/lib/spark/python/pyspark/sql/column.py in _(self, other)
108 def _(self, other):
109 jc = other._jc if isinstance(other, Column) else other
--> 110 njc = getattr(self._jc, name)(jc)
111 return Column(njc)
112 _.__doc__ = doc
/usr/lib/spark/python/lib/py4j-0.9-src.zip/py4j/java_gateway.py in __call__(self, *args)
802
803 args_command = "".join(
--> 804 [get_command_part(arg, self.pool) for arg in new_args])
805
806 command = proto.CALL_COMMAND_NAME +\
/usr/lib/spark/python/lib/py4j-0.9-src.zip/py4j/protocol.py in get_command_part(parameter, python_proxy_pool)
276 command_part += ";" + interface
277 else:
--> 278 command_part = REFERENCE_TYPE + parameter._get_object_id()
279
280 command_part += "\n"
AttributeError: 'datetime.timedelta' object has no attribute '_get_object_id'
How do I resolve this?
Thanks in advance!
I know this is very old, but I solved the issue doing this:
sample = sample.withColumn('next_date', when(sample.next_date.isNull(), date_add(col('next_date'), 1).otherwise(sample['next_date']))
Hope this helps someone!
I have a pyspark dataframe contains rows of data seperated by comma. I want to split each row and apply LabeledPoints method to it. Then covnert it to dataframe.
Here is my code
import os.path
from pyspark.mllib.regression import LabeledPoint
import numpy as np
file_name = os.path.join('databricks-datasets', 'cs190', 'data-001', 'millionsong.txt')
raw_data_df = sqlContext.read.load(file_name, 'text')
rdd = raw_data_df.rdd.map(lambda line: line.split(',')).map(lambda seq:LabeledPoints(seq[0],seq[1:])).toDF()
It gives the following error message after apply .DF().
---------------------------------------------------------------------------
org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 38.0 failed 1 times, most recent failure: Lost task 0.0 in stage 38.0 (TID 44, localhost): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
Py4JJavaError Traceback (most recent call last)
<ipython-input-65-dc4d86a8ee45> in <module>()
----> 1 rdd = raw_data_df.rdd.map(lambda line: line.split(',')).map(lambda seq:LabeledPoints(seq[0],seq[1:])).toDF()
2 print(type(rdd))
3 #print(rdd.take(5))
/databricks/spark/python/pyspark/sql/context.py in toDF(self, schema, sampleRatio)
62 [Row(name=u'Alice', age=1)]
63 """
---> 64 return sqlContext.createDataFrame(self, schema, sampleRatio)
65
66 RDD.toDF = toDF
/databricks/spark/python/pyspark/sql/context.py in createDataFrame(self, data, schema, samplingRatio)
421
422 if isinstance(data, RDD):
--> 423 rdd, schema = self._createFromRDD(data, schema, samplingRatio)
424 else:
425 rdd, schema = self._createFromLocal(data, schema)
/databricks/spark/python/pyspark/sql/context.py in _createFromRDD(self, rdd, schema, samplingRatio)
Answer found:
rdd = raw_data_df.map(lambda row: row['value'].split(',')).map(lambda seq:LabeledPoint(float(seq[0]),seq[1:])).toDF()
Here, I need to specifically reference each line of text using row['value'], even though there is only one feature in the row.
I am trying to index a datetime that is being formed from 3 columns representing (year, dayofyear, and 2400hr time).
2014,323,1203,47.77,320.9
2014,323,1204,48.46,402.6
2014,323,1205,49.2,422.7
2014,323,1206,49.82,432.4
2014,323,1207,50.03,438.6
2014,323,1208,50.15,445.4
2014,323,1209,50.85,449.7
2014,323,1210,50.85,454.4
2014,323,1211,50.85,458.1
2014,323,1212,50.91,460.2
I am using the following code:
In [1]:
import numpy as np
import pandas as pd
from datetime import datetime
import matplotlib.pyplot as plt
In [2]:
def parse(yr, yearday, hrmn):
date_string = ' '.join([yr, yearday, hrmn])
print(date_string)
return datetime.strptime(date_string,"%Y %j %H%M")
In [3]:
df = pd.read_csv('home_prepped.dat', parse_dates={'datetime':[0,1,2]},
date_parser=parse, index_col='datetime', header=None)
I have had success bringing it in when the data was flawed (had extra data over DST change), and now that it is fixed (removed and stitched back together) I am having this error (in its entirety):
2014 92 2355
2014 92 2356
2014 92 2357
2014 92 2358
2014 92 2359
2014 92 2400
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-5-9c710834ee23> in <module>()
1
----> 2 df = pd.read_csv('home_prepped.dat', parse_dates={'datetime':[0,1,2]}, date_parser=parse, index_col='datetime', header=None)
/Volumes/anaconda/anaconda/lib/python2.7/site-packages/pandas/io/parsers.pyc in parser_f(filepath_or_buffer, sep, dialect, compression, doublequote, escapechar, quotechar, quoting, skipinitialspace, lineterminator, header, index_col, names, prefix, skiprows, skipfooter, skip_footer, na_values, na_fvalues, true_values, false_values, delimiter, converters, dtype, usecols, engine, delim_whitespace, as_recarray, na_filter, compact_ints, use_unsigned, low_memory, buffer_lines, warn_bad_lines, error_bad_lines, keep_default_na, thousands, comment, decimal, parse_dates, keep_date_col, dayfirst, date_parser, memory_map, float_precision, nrows, iterator, chunksize, verbose, encoding, squeeze, mangle_dupe_cols, tupleize_cols, infer_datetime_format, skip_blank_lines)
463 skip_blank_lines=skip_blank_lines)
464
--> 465 return _read(filepath_or_buffer, kwds)
466
467 parser_f.__name__ = name
/Volumes/anaconda/anaconda/lib/python2.7/site-packages/pandas/io/parsers.pyc in _read(filepath_or_buffer, kwds)
249 return parser
250
--> 251 return parser.read()
252
253 _parser_defaults = {
/Volumes/anaconda/anaconda/lib/python2.7/site-packages/pandas/io/parsers.pyc in read(self, nrows)
708 raise ValueError('skip_footer not supported for iteration')
709
--> 710 ret = self._engine.read(nrows)
711
712 if self.options.get('as_recarray'):
/Volumes/anaconda/anaconda/lib/python2.7/site-packages/pandas/io/parsers.pyc in read(self, nrows)
1209 data = dict((k, v) for k, (i, v) in zip(names, data))
1210
-> 1211 names, data = self._do_date_conversions(names, data)
1212 index, names = self._make_index(data, alldata, names)
1213
/Volumes/anaconda/anaconda/lib/python2.7/site-packages/pandas/io/parsers.pyc in _do_date_conversions(self, names, data)
1033 data, names = _process_date_conversion(
1034 data, self._date_conv, self.parse_dates, self.index_col,
-> 1035 self.index_names, names, keep_date_col=self.keep_date_col)
1036
1037 return names, data
/Volumes/anaconda/anaconda/lib/python2.7/site-packages/pandas/io/parsers.pyc in _process_date_conversion(data_dict, converter, parse_spec, index_col, index_names, columns, keep_date_col)
2100
2101 _, col, old_names = _try_convert_dates(converter, colspec,
-> 2102 data_dict, orig_names)
2103
2104 new_data[new_name] = col
/Volumes/anaconda/anaconda/lib/python2.7/site-packages/pandas/io/parsers.pyc in _try_convert_dates(parser, colspec, data_dict, columns)
2132 to_parse = [data_dict[c] for c in colnames if c in data_dict]
2133
-> 2134 new_col = parser(*to_parse)
2135 return new_name, new_col, colnames
2136
/Volumes/anaconda/anaconda/lib/python2.7/site-packages/pandas/io/parsers.pyc in converter(*date_cols)
2048 dayfirst=dayfirst)
2049 except Exception:
-> 2050 return generic_parser(date_parser, *date_cols)
2051
2052 return converter
/Volumes/anaconda/anaconda/lib/python2.7/site-packages/pandas/io/date_converters.pyc in generic_parser(parse_func, *cols)
36 for i in range(N):
37 args = [c[i] for c in cols]
---> 38 results[i] = parse_func(*args)
39
40 return results
<ipython-input-2-57e18ddd7deb> in parse(yr, yearday, hrmn)
1 def parse(yr, yearday, hrmn):
2 date_string = ' '.join([yr, yearday, hrmn])
----> 3 return datetime.strptime(date_string,"%Y %j %H%M")
/Volumes/anaconda/anaconda/python.app/Contents/lib/python2.7/_strptime.pyc in _strptime(data_string, format)
326 if len(data_string) != found.end():
327 raise ValueError("unconverted data remains: %s" %
--> 328 data_string[found.end():])
329
330 year = None
ValueError: unconverted data remains: 0
I am looking for suggestions as to how to debug or work around this. I have gone through the data and according to what I have read in similar posts I should be looking for extraneous time data, which is not there.
Thanks.