Qore CsvUtil Module Reference  1.5
CsvUtil.qm.dox.h
1 // -*- mode: c++; indent-tabs-mode: nil -*-
2 // @file CsvUtil.qm Qore user module for working with CSV files
3 
4 /* CsvUtil.qm Copyright 2012 - 2016 Qore Technologies, sro
5 
6  Permission is hereby granted, free of charge, to any person obtaining a
7  copy of this software and associated documentation files (the "Software"),
8  to deal in the Software without restriction, including without limitation
9  the rights to use, copy, modify, merge, publish, distribute, sublicense,
10  and/or sell copies of the Software, and to permit persons to whom the
11  Software is furnished to do so, subject to the following conditions:
12 
13  The above copyright notice and this permission notice shall be included in
14  all copies or substantial portions of the Software.
15 
16  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
21  FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
22  DEALINGS IN THE SOFTWARE.
23 */
24 
25 // minimum required Qore version
26 
27 // assume local var scope, do not use "$" for vars, members, and method calls
28 
29 
30 /* see release notes below for version history
31 */
32 
// Helper class shared by the CSV iterators and writers: AbstractCsvIterator and
// AbstractCsvWriter both declare it as a private parent class.
// NOTE(review): the listing's line numbers skip in several places inside this class
// (e.g. 269, 283, 286, 304), so some original doc comments are not shown here.
263 class CsvHelper {
264 
265 public:
266  private :
     // Bit flags (0x1 and 0x2). The Options hashes of the iterator and writer classes
     // map each option name to C_OPT1, C_OPT2 or C_OPT1|C_OPT2.
     // NOTE(review): presumably C_OPT1 marks the options-only ("old") API and C_OPT2 the
     // spec-based ("new") API -- confirm against CsvUtil.qm.
267  const C_OPT1 = 0x1;
268  const C_OPT2 = 0x2;
     // Hash keyed by type name; every base type is also present with a "*" prefix.
     // NOTE(review): presumably the set of accepted field types used by checkType() -- confirm.
270  const Types = (
271  "int": True,
272  "*int": True,
273  "float": True,
274  "*float": True,
275  "number": True,
276  "*number": True,
277  "string": True,
278  "*string": True,
279  "date": True,
280  "*date": True,
281  );
282 
     // List of field attribute names.
     // NOTE(review): presumably the keys allowed in a field description hash -- confirm.
284  const FieldAttrs = ("type", "format", "timezone", "code", "header");
285 
     // NOTE(review): shares its name with the "tolwr" option key in AbstractCsvIterator::Options.
287  bool tolwr = False;
288 
290  string date_format;
291 
293  hash m_specs;
294 
     // NOTE(review): presumably set from the constructor's n_errname argument -- confirm.
296  string errname;
297 
298  // reorder data according to the headers set by options.headers or read from CsvHeader
299  bool headerReorder = True;
300 
301 
302 public:
303 
305  constructor (string n_errname);
306 
307 
309  private bool isMultiType();
310 
311 
313  private checkType(string fld_errs, string key, string value);
314 
315 
316  // get spec from options.fields for old Csv. Check spec param for new Csv
     // The last parameter is named like the C_OPT1 / C_OPT2 flags declared above.
317  private hash getSpec(*hash fields, string fld_errs, int C_OPTx);
318 
319 
320  private hash getSpec1(*hash fields);
321 
322 
323  private hash getSpec2(hash spec);
324 
325 
330  private list adjustFieldsFromHeaders(string type, *list headers);
331 
332 
333 }; // class CsvHelper
334 
336 namespace CsvUtil {
     // Unix end of line character sequence (for new OS X too)
338  const EOL_UNIX = "\n";
     // MS DOS/Windows end of line character sequence
340  const EOL_WIN = "\r\n";
     // Old (pre-OSX) Macintosh end of line character sequence
342  const EOL_MACINTOSH = "\r";
343 
344  // helper list of end of line values
345  const EOLS = (EOL_UNIX, EOL_WIN, EOL_MACINTOSH, );
346 
     // Record type when not matching any type
348  const CSV_TYPE_UNKNOWN = "<unknown>";
     // Record type when multi-type is disabled
350  const CSV_TYPE_SINGLE = "<single>";
351 
352 
354 
// Abstract base class for iterating CSV data. It derives publicly from
// Qore::AbstractIterator and privately from CsvHelper; subclasses supply the three
// abstract *Impl() methods declared below.
592 class AbstractCsvIterator : public Qore::AbstractIterator, private CsvHelper {
593 
594 public:
595  private :
     // valid options for the object (a hash for quick lookups of valid keys);
     // each value is a combination of the C_OPT1 / C_OPT2 flags from CsvHelper.
     // Several options are listed under both a dashed and an underscored spelling.
597  const Options = (
598  "date_format": C_OPT1|C_OPT2,
599  "date-format": C_OPT1|C_OPT2,
600  "encoding": C_OPT1|C_OPT2,
601  "eol": C_OPT1|C_OPT2,
602  "extended_record": C_OPT2,
603  "fields": C_OPT1,
604  "header-lines": C_OPT1|C_OPT2,
605  "header_lines": C_OPT1|C_OPT2,
606  "header-names": C_OPT1|C_OPT2,
607  "header_names": C_OPT1|C_OPT2,
608  "header_reorder": C_OPT1|C_OPT2,
609  "headers": C_OPT1,
610  "ignore-empty": C_OPT1|C_OPT2,
611  "ignore_empty": C_OPT1|C_OPT2,
612  "ignore-whitespace": C_OPT1|C_OPT2,
613  "ignore_whitespace": C_OPT1|C_OPT2,
614  "quote": C_OPT1|C_OPT2,
615  "separator": C_OPT1|C_OPT2,
616  "timezone": C_OPT1|C_OPT2,
617  "tolwr": C_OPT1|C_OPT2,
618  "verify-columns": C_OPT1|C_OPT2,
619  "verify_columns": C_OPT1|C_OPT2,
620  );
621 
622  // field separator
623  string separator = ",";
624 
625  // field content delimiter
626  string quote = "\"";
627 
628  // number of header lines
629  softint headerLines = 0;
630 
631  // flag to use string names from the first header row if possible
632  bool headerNames = False;
633 
634  // True if empty lines should be ignored
635  bool ignoreEmptyLines = True;
636 
637  // Flag to trim the field content (trim leading and trailing whitespace) from unquoted fields
638  bool ignoreWhitespace = True;
639 
640  // the @ref Qore::TimeZone to use when parsing dates (default: current time zone)
641  *TimeZone timezone;
642 
643  // verify the column count for every row; if a row does not match, then throw a \c CSVFILEITERATOR-DATA-ERROR exception
644  bool checkElementCounts = False;
645 
646  // getRecord/getValue returns extended hash
647  bool extendedRecord = False;
648 
649  // column count for verifying column counts
650  int cc;
651 
652  // current record count for the index() method
653  int rc = 0;
654 
655  // to resolve record type by rules
656  hash m_resolve_by_rule;
657 
658  // to resolve record type by number of fields
659  hash m_resolve_by_count;
660 
661  // list of idx to field transformations, in order of spec
662  hash m_resolve_by_idx;
663 
664 
665 public:
666 
668 
     // creates the AbstractCsvIterator with an option hash in single-type mode
672  constructor(*hash opts);
673 
674 
676 
     // Overload taking a record specification plus options.
     // NOTE(review): presumably the multi-type counterpart of the constructor above -- confirm.
680  // NOTE: when declared as *hash then always calls this constructor
681  constructor(hash spec, hash opts);
682 
683 
     // process common options and assign internal fields
685  private processCommonOptions(*hash opts, int C_OPTx);
686 
687 
     // process specification and assign internal data for resolving
689  private processSpec(hash spec);
690 
691 
     // match headers provided at Csv header or in options
693  private prepareFieldsFromHeaders(*list headers);
694 
695 
     // Returns the name of the input data
697  private *string getDataName();
698 
699 
     // Returns the current line number (implemented by subclasses)
701  private abstract int lineNumberImpl();
702 
     // Returns the current line (implemented by subclasses)
704  private abstract string getLineValueImpl();
705 
     // Moves the current line / record position to the next line / record (implemented by subclasses)
707  private abstract bool nextLineImpl();
708 
710 
     // Moves the current line / record position to the next line / record
715  bool next();
716 
717 
719 
     // Returns the given column value for the current row
726  any memberGate(string name);
727 
728 
730 
     // Returns the current record as a hash
741  hash getValue();
742 
743 
745 
     // NOTE(review): presumably "extended" selects the extended hash form described by
     // the extendedRecord member above -- confirm.
758  hash getRecord(bool extended);
759 
760 
762 
     // Returns the current record as a hash
773  hash getRecord();
774 
775 
777 
     // Returns the current record as a list
789  any getRecordList();
790 
791 
793 
     // Returns the current separator string
800  string getSeparator();
801 
802 
804 
     // Returns the current quote string
811  string getQuote();
812 
813 
815 
     // Returns the current record headers or NOTHING if no headers have been detected or saved yet
822  *list getHeaders();
823 
824 
826 
     // NOTE(review): presumably returns the headers of the given record type -- confirm.
831  *list getHeaders(string type);
832 
833 
835 
     // Returns the row index being iterated, which does not necessarily correspond to the line number
846  int index();
847 
848 
850 
     // Returns the current iterator line number in the file (the first line is line 1)
865  int lineNumber();
866 
867 
868  private any handleType(hash fh, *string val);
869 
870 
     // Read line split by separator/quote into list
872  private list getLineAndSplit();
873 
874 
876 
     // Identify a line type using identifyTypeImpl(); may be overridden if necessary
883  string identifyType(list rec);
884 
885 
887 
     // Identify an input record; declared to return *string, unlike identifyType() above
894  private *string identifyTypeImpl(list rec);
895 
896 
     // Parses a line in the file and returns the processed fields (declared return type: hash)
898  private hash parseLine();
899 
900  };
901 
903 
909 
// Members of CsvFileIterator (see the closing comment below), which allows CSV files
// to be iterated on a record basis.
// NOTE(review): the class declaration line itself is missing from this listing; the
// cross-reference index places it at CsvUtil.qm.dox.h:908.
910 public:
912 
     // Creates the CsvFileIterator in single-type mode with the path of the file to read and an option hash
917  constructor(string path, *hash opts) ;
918 
920 
     // Overload taking a record specification plus options.
     // NOTE(review): presumably the multi-type counterpart of the constructor above -- confirm.
924  constructor(string path, hash spec, hash opts) ;
925 
926 
927  any memberGate(string name);
928 
929 
     // Returns the name of the input data
931  private *string getDataName();
932 
933 
     // Returns the current line number
935  private int lineNumberImpl();
936 
937 
     // Returns the current line trimmed of the EOL character(s)
939  private string getLineValueImpl();
940 
941 
     // Moves the current line / record position to the next line / record
943  private bool nextLineImpl();
944 
945  }; // CsvFileIterator class
946 
948 
// The CsvDataIterator class allows arbitrary CSV string data to be iterated on a
// record basis. It derives from CsvUtil::AbstractCsvIterator and DataLineIterator and
// declares concrete versions of the parent's three *Impl() methods.
953 class CsvDataIterator : public CsvUtil::AbstractCsvIterator,public DataLineIterator {
954 
955 public:
956 
958 
     // Creates the CsvDataIterator with the input data and optionally an option hash
963  constructor(string data, *hash opts) ;
964 
965 
967 
     // Overload taking a record specification plus options.
     // NOTE(review): presumably the multi-type counterpart of the constructor above -- confirm.
971  constructor(string data, hash spec, hash opts) ;
972 
973 
     // Same signature as AbstractCsvIterator::memberGate()
974  any memberGate(string name);
975 
976 
     // Returns the current line number
978  private int lineNumberImpl();
979 
980 
     // Returns the current line trimmed of the EOL character(s)
982  private string getLineValueImpl();
983 
984 
     // Moves the current line / record position to the next line / record
986  private bool nextLineImpl();
987 
988  };
989 
991 
// The AbstractCsvWriter class provides a parent for all CSV writers. It derives
// privately from CsvHelper; subclasses must implement writeRawLine().
1100 class AbstractCsvWriter : private CsvHelper {
1101 
1102 public:
1103  private :
      // valid options for the object (a hash for quick lookups of valid keys);
      // each value is a combination of the C_OPT1 / C_OPT2 flags from CsvHelper
1105  const Options = (
1106  "block": C_OPT1|C_OPT2,
1107  "datamap": C_OPT1,
1108  "date_format": C_OPT1|C_OPT2,
1109  "date-format": C_OPT1|C_OPT2,
1110  "encoding": C_OPT1|C_OPT2,
1111  "eol": C_OPT1|C_OPT2,
1112  "fields": C_OPT1,
1113  "headers": C_OPT1,
1114  "header_reorder": C_OPT1,
1115  "info_log": C_OPT1|C_OPT2,
1116  "optimal_quotes": C_OPT1|C_OPT2,
1117  "optimal-quotes": C_OPT1|C_OPT2,
1118  "quote": C_OPT1|C_OPT2,
1119  "quote_escape": C_OPT1|C_OPT2,
1120  "separator": C_OPT1|C_OPT2,
1121  "verify_columns": C_OPT1|C_OPT2,
1122  "verify-columns": C_OPT1|C_OPT2,
1123  "write_headers": C_OPT1|C_OPT2,
1124  "write-headers": C_OPT1|C_OPT2,
1125  );
1126 
      // output file character encoding
1128  string encoding;
1129 
      // field separator
1131  string separator = ",";
1132 
      // field content delimiter
1134  string quote = "\"";
1135 
      // quote escape character
1137  string m_quoteEscapeChar = "\\";
1138 
      // end of line sequence
1140  string eol = EOL_UNIX;
1141 
      // NOTE(review): the cross-reference index lists "bool checkElementCounts" at line 1143,
      // but its declaration is missing from this listing.
1144 
      // the latest line number
1146  int lineNo = 0;
1147 
      // block size for bulk DML
1149  int block = 1000;
1150 
      // NOTE(review): the cross-reference index lists "string baseTemplate" (1152),
      // "bool write_headers" (1155) and "bool optimal_quotes" (1158) here; their
      // declarations are missing from this listing.
1153 
1156 
1159 
      // a closure/call reference for informational logging when using write(SQLStatement)
1161  *code info_log;
1162 
      // NOTE(review): the cross-reference index lists "hash m_out_by_name" (1164) and
      // "hash m_out_by_idx" (1167) here; their declarations are missing from this listing.
1165 
1168 
1169 public:
1170 
1172 
      // Creates the AbstractCsvWriter in single-type mode
1178  constructor(string n_errname, *hash n_opts);
1179 
1180 
1182 
      // Overload taking a record specification plus options.
      // NOTE(review): presumably the multi-type counterpart of the constructor above -- confirm.
1190  constructor(string n_errname, hash spec, hash n_opts);
1191 
1192 
      // Process options and set internal variables
1194  private processCommonOptions(*hash n_opts, int C_OPTx);
1195 
1196 
      // Process specification and set internal variable for mapping
1198  private processSpec();
1199 
1200 
      // Write csv headers
1202  private writeHeaders();
1203 
1204 
1206 
      // Write a line with a list of values; data are checked against column rules
1211  writeLine(list values);
1212 
1213 
1215 
      // Overload of writeLine() taking the values as a hash
1220  writeLine(hash values);
1221 
1222 
1224 
      // Overload of writeLine() taking a record type name and a list of values
1230  writeLine(string type, list values);
1231 
1232 
1234 
      // Overload of writeLine() taking a record type name and a hash of values
1240  writeLine(string type, hash values);
1241 
1242 
1244 
      // Stream an iterator into the output
1251  write(Qore::AbstractIterator iterator);
1252 
1253 
1255 
      // Overload of write() taking a Qore::SQL::SQLStatement
1262  write(Qore::SQL::SQLStatement iterator);
1263 
1264 
1266 
      // Overload of write() taking a list
1273  write(list l);
1274 
1275 
      // This method must be overridden in child classes to provide the output implementation
1277  abstract private writeRawLine(list values);
1278 
1280 
      // Prepare a string with formatting and escaping
1284  private string prepareRawLine(list values);
1285 
1286 
1287  }; // AbstractCsvWriter class
1288 
1291 
// Members of CsvFileWriter (see the closing comment below), the class for safe CSV
// file creation.
// NOTE(review): the class declaration line itself is missing from this listing; the
// cross-reference index places it at CsvUtil.qm.dox.h:1290.
1292 public:
1293 
1294  private :
1295  // a file to write
1296  File file;
1297 
1298 public:
1299 
1301 
      // creates the CsvFileWriter in single-type mode from a file path and an optional option hash
1309  constructor(string path, *hash opts) ;
1310 
1311 
1313 
      // Overload taking a record specification plus options.
      // NOTE(review): presumably the multi-type counterpart of the constructor above -- confirm.
1322  constructor(string path, hash spec, hash opts) ;
1323 
1324 
      // NOTE(review): presumably opens the "file" member for the given path -- confirm.
1325  private openFile(string path);
1326 
1327 
      // Same signature as the abstract AbstractCsvWriter::writeRawLine()
1328  private writeRawLine(list values);
1329 
1330 
1331  }; // CsvFileWriter
1332 
1335 
// Members of CsvStringWriter (see the closing comment below), the class for in-memory
// string CSV creation.
// NOTE(review): the class declaration line itself is missing from this listing; the
// cross-reference index places it at CsvUtil.qm.dox.h:1334.
1336 public:
1337 
1338  private :
1339  // a csv content
1340  string content;
1341 
1342 public:
1343 
1345 
      // creates the CsvStringWriter in single-type mode with content in the memory
1350  constructor(*hash opts) ;
1351 
1352 
1354 
      // Overload taking a record specification plus options.
      // NOTE(review): presumably the multi-type counterpart of the constructor above -- confirm.
1360  constructor(hash spec, hash opts) ;
1361 
1362 
      // NOTE(review): presumably initializes the "content" member -- confirm.
1363  private initContent();
1364 
1365 
      // Same signature as the abstract AbstractCsvWriter::writeRawLine()
1366  private writeRawLine(list values);
1367 
1368 
1370 
      // Stream iterator and return a CSV-formatted output string
1379  string write(Qore::AbstractIterator iterator);
1380 
1381 
1383 
      // Overload of write() taking a list; also declared to return a string
1392  string write(list l);
1393 
1394 
      // Get the current in-memory content as a string
1396  string getContent();
1397 
1398 
1399  }; // CsvStringWriter
1400 
1401 }; // CsvUtil namespace
private writeRawLine(list values)
This method must be overridden in child classes to provide the output implementation.
bool write_headers
this flag determines if any stored headers are output
Definition: CsvUtil.qm.dox.h:1155
constructor(string data, *hash opts)
Creates the CsvDataIterator with the input data and optionally an option hash.
hash m_out_by_name
mapping output field by name
Definition: CsvUtil.qm.dox.h:1164
constructor(*hash opts)
creates the CsvStringWriter single-type mode with content in the memory
private string prepareRawLine(list values)
Prepare a string (line with EOL) with formatting and escaping.
int index()
Returns the row index being iterated, which does not necessarily correspond to the line number when t...
string write(Qore::AbstractIterator iterator)
Stream iterator and return a CSV-formatted output string.
the AbstractCsvIterator class is an abstract base class that allows abstract CSV data to be iterated ...
Definition: CsvUtil.qm.dox.h:592
private *string getDataName()
Returns the name of the input data.
private bool nextLineImpl()
Moves the current line / record position to the next line / record; returns False if there are no mor...
const True
const Options
valid options for the object (a hash for quick lookups of valid keys)
Definition: CsvUtil.qm.dox.h:597
hash m_out_by_idx
mapping output field by index
Definition: CsvUtil.qm.dox.h:1167
private processCommonOptions(*hash opts, int C_OPTx)
process common options and assign internal fields
string getQuote()
Returns the current quote string.
private processSpec()
Process specification and set internal variable for mapping.
abstract private int lineNumberImpl()
Returns the current line number.
private writeRawLine(list values)
This method must be overridden in child classes to provide the output implementation.
bool checkElementCounts
verify the column count for every row; if a row does not match, then throw a CSVFILEITERATOR-DATA-ERR...
Definition: CsvUtil.qm.dox.h:1143
private int lineNumberImpl()
Returns the current line number.
constructor(string path, *hash opts)
Creates the CsvFileIterator in single-type mode with the path of the file to read and an option hash...
private hash parseLine()
Parses a line in the file and returns a processed list of the fields.
const False
private list getLineAndSplit()
Read line split by separator/quote into list.
string m_quoteEscapeChar
quote escape character
Definition: CsvUtil.qm.dox.h:1137
private *string getDataName()
Returns the name of the input data.
*code info_log
a closure/call reference for informational logging when using write(SQLStatement) ...
Definition: CsvUtil.qm.dox.h:1161
list list(...)
const Options
valid options for the object (a hash for quick lookups of valid keys)
Definition: CsvUtil.qm.dox.h:1105
write(Qore::AbstractIterator iterator)
Stream an iterator into the output.
The CsvFileIterator class allows CSV files to be iterated on a record basis.
Definition: CsvUtil.qm.dox.h:908
string getContent()
Get the current in-memory content as a string.
string eol
end of line sequence
Definition: CsvUtil.qm.dox.h:1140
string getSeparator()
Returns the current separator string.
any getRecordList()
Returns the current record as a list.
private processCommonOptions(*hash n_opts, int C_OPTx)
Process options and set internal variables.
The CsvStringWriter class for in-memory string CSV creation.
Definition: CsvUtil.qm.dox.h:1334
string separator
field separator
Definition: CsvUtil.qm.dox.h:1131
string quote
field content delimiter
Definition: CsvUtil.qm.dox.h:1134
any memberGate(string name)
Returns the given column value for the current row.
The AbstractCsvWriter class provides a parent for all CSV writers.
Definition: CsvUtil.qm.dox.h:1100
private processSpec(hash spec)
process specification and assign internal data for resolving
private writeHeaders()
Write csv headers.
string baseTemplate
base template for value format
Definition: CsvUtil.qm.dox.h:1152
const EOL_MACINTOSH
Old (pre-OSX) Macintosh end of line character sequence.
Definition: CsvUtil.qm.dox.h:342
const CSV_TYPE_UNKNOWN
Record type when non matching any type.
Definition: CsvUtil.qm.dox.h:348
abstract private string getLineValueImpl()
Returns the current line.
string type(any arg)
private bool nextLineImpl()
Moves the current line / record position to the next line / record; returns False if there are no mor...
const EOL_UNIX
Unix end of line character sequence (for new OS X too)
Definition: CsvUtil.qm.dox.h:338
*list getHeaders()
Returns the current record headers or NOTHING if no headers have been detected or saved yet...
private *string identifyTypeImpl(list rec)
Identify an input record, given the raw line string. This method performs a lookup to a precalculated ...
constructor(string n_errname, *hash n_opts)
Creates the AbstractCsvWriter in single-type mode.
private prepareFieldsFromHeaders(*list headers)
match headers provided at Csv header or in options, never called for multi-type because header_names ...
private int lineNumberImpl()
Returns the current line number; returns 0 if not pointing at any data.
int lineNo
the latest line number
Definition: CsvUtil.qm.dox.h:1146
writeLine(list values)
Write a line with a list of values; data are checked against column rules.
hash getValue()
Returns the current record as a hash.
private string getLineValueImpl()
Returns the current line trimmed of the EOL character(s)
const EOL_WIN
MS DOS/Windows end of line character sequence.
Definition: CsvUtil.qm.dox.h:340
constructor(*hash opts)
creates the AbstractCsvIterator with an option hash in single-type mode
The CsvDataIterator class allows arbitrary CSV string data to be iterated on a record basis...
Definition: CsvUtil.qm.dox.h:953
constructor(string path, *hash opts)
creates the CsvFileWriter in single-type mode with the path of the file to write and an optional optio...
int block
block size for bulk DML
Definition: CsvUtil.qm.dox.h:1149
string identifyType(list rec)
Identify a fixed-length line type using identifyTypeImpl(); may be overridden if necessary.
bool optimal_quotes
stores the optimal quotes option
Definition: CsvUtil.qm.dox.h:1158
string encoding
output file character encoding
Definition: CsvUtil.qm.dox.h:1128
the CsvUtil namespace contains all the objects in the CsvUtil module
Definition: CsvUtil.qm.dox.h:336
abstract private bool nextLineImpl()
Moves the current line / record position to the next line / record; returns False if there are no mor...
The CsvFileWriter class for safe CSV file creation.
Definition: CsvUtil.qm.dox.h:1290
hash hash(object obj)
private string getLineValueImpl()
Returns the current line trimmed of the EOL character(s)
abstract private writeRawLine(list values)
This method must be overridden in child classes to provide the output implementation.
bool next()
Moves the current line / record position to the next line / record; returns False if there are no mor...
const CSV_TYPE_SINGLE
Record type when multi-type is disabled.
Definition: CsvUtil.qm.dox.h:350
hash getRecord()
Returns the current record as a hash.
int lineNumber()
Returns the current iterator line number in the file (the first line is line 1) or 0 if not pointing ...