From 5f6e5ae9fb7caba4c4b63691c382e21928f0007c Mon Sep 17 00:00:00 2001 From: ephphatha Date: Sun, 27 Aug 2023 20:42:59 +1000 Subject: [PATCH] Refactor iterator parsing to allow more reuse --- Packaging/resources/assets/txtdata/Readme.md | 60 ++- Source/data/file.cpp | 101 +++- Source/data/file.hpp | 98 +++- Source/data/iterators.hpp | 473 +++++++++++------- Source/data/parser.cpp | 41 +- Source/data/parser.hpp | 66 +-- Source/playerdat.cpp | 176 ++----- test/Fixtures.cmake | 4 + test/data_file_test.cpp | 291 +++++++++-- test/fixtures/txtdata/empty.tsv | 0 test/fixtures/txtdata/empty_with_utf8_bom.tsv | 1 + test/fixtures/txtdata/lf_no_trail.tsv | 4 + test/fixtures/txtdata/sample.tsv | 2 + 13 files changed, 882 insertions(+), 435 deletions(-) create mode 100644 test/fixtures/txtdata/empty.tsv create mode 100644 test/fixtures/txtdata/empty_with_utf8_bom.tsv create mode 100644 test/fixtures/txtdata/lf_no_trail.tsv create mode 100644 test/fixtures/txtdata/sample.tsv diff --git a/Packaging/resources/assets/txtdata/Readme.md b/Packaging/resources/assets/txtdata/Readme.md index 5510cdb90..dd1c8cb66 100644 --- a/Packaging/resources/assets/txtdata/Readme.md +++ b/Packaging/resources/assets/txtdata/Readme.md @@ -21,28 +21,49 @@ A formal description of the format using The first record in a file SHOULD be used as a header Implementations are free to treat records however they want, using multiple records as headers or none at all. - Files SHOULD contain at least one record. - An empty file (zero-length or containing only a UTF8 BOM sequence) is - valid and is typically interpreted as a header for a single unnamed column - with no records. + All files contain at least one record. The last record in a file SHOULD end + with a trailing newline, however some spreadsheet applications do not + output trailing newlines so we allow flexibility here. 
This means that an + empty file (zero-length or containing only a UTF8 BOM sequence) is valid + and is typically interpreted as a header for a single unnamed column with + no records. Records SHOULD contain the same number of fields. - Files SHOULD end with a trailing newline. - Note that while a trailing newline is treated as if the file has a final - record containing a single empty field, in practice these records SHOULD - be discarded. This is mainly to handle the typical output from spreadsheet - applications like Excel. */ -DXTxtFile ::= utf8bom? header ( recordsep, record )* +DXTxtFile ::= utf8bom? ( singleRecord | record+ finalRecord? ) -utf8bom ::= #xEF #xBB #xBF +utf8bom ::= #xEF #xBB #xBF -header ::= record +/* + if parsing reaches EOF and the file has no record terminators then the file + MUST be treated as containing a single record with no terminator +*/ +singleRecord ::= fields + +/* + if for some reason you want to end a file with a record containing a single + empty field then the record MUST end with a valid terminator +*/ +record ::= fields recordterm + +/* + this means that the terminator is only truly optional for files where the + final record contains a single field with at least one character, or at least + two fields (as there will then be at least one field separator even if both + fields are zero-length) +*/ +finalRecord ::= nonEmptyField | field fieldsep fields /* an empty line is treated as a single empty field */ -record ::= field ( fieldsep field )* +fields ::= field ( fieldsep field )* /* fields MAY be zero length */ -field ::= character* +field ::= character* + +/* + unless the final record only consists of a single field, in which case it + MUST contain at least one character +*/ +nonEmptyField ::= character+ /* Any Char (see https://www.w3.org/TR/xml/#NT-Char) except Tab ("\t", 0x09), @@ -50,13 +71,16 @@ field ::= character* field value. 
For maximum portability characters in the discouraged ranges described in the linked section SHOULD NOT be used */ -character ::= [^#x9#xA#xD] +character ::= [^#x9#xA#xD] /* fields MUST be separated by tabs */ -fieldsep ::= #x9 +fieldsep ::= #x9 -/* records MUST be separated by line feeds, a cr/lf pair MAY be used */ -recordsep ::= #xD? #xA +/* + records (other than the final record or the only record in a single-record + file) MUST be terminated by line feeds, a cr/lf pair MAY be used +*/ +recordterm ::= #xD? #xA ``` ## File Descriptions diff --git a/Source/data/file.cpp b/Source/data/file.cpp index 0e6b847bf..242f0d920 100644 --- a/Source/data/file.cpp +++ b/Source/data/file.cpp @@ -1,6 +1,9 @@ #include "file.hpp" +#include + #include "engine/assets.hpp" +#include "utils/language.h" namespace devilution { tl::expected DataFile::load(std::string_view path) @@ -13,9 +16,103 @@ tl::expected DataFile::load(std::string_view path) std::unique_ptr data { new char[size] }; { AssetHandle handle = OpenAsset(std::move(ref)); - if (!handle.ok() || !handle.read(data.get(), size)) - return tl::unexpected { Error::ReadError }; + if (!handle.ok()) + return tl::unexpected { Error::OpenFailed }; + if (size > 0 && !handle.read(data.get(), size)) + return tl::unexpected { Error::BadRead }; } return DataFile { std::move(data), size }; } + +void DataFile::reportFatalError(Error code, std::string_view fileName) +{ + switch (code) { + case Error::NotFound: + case Error::OpenFailed: + case Error::BadRead: + app_fatal(fmt::format(fmt::runtime(_( + /* TRANSLATORS: Error message when a data file is missing or corrupt. Arguments are {file name} */ + "Unable to load data from file {0}")), + fileName)); + case Error::NoContent: + app_fatal(fmt::format(fmt::runtime(_( + /* TRANSLATORS: Error message when a data file is empty or only contains the header row. 
Arguments are {file name} */ + "{0} is incomplete, please check the file contents.")), + fileName)); + case Error::NotEnoughColumns: + app_fatal(fmt::format(fmt::runtime(_( + /* TRANSLATORS: Error message when a data file doesn't contain the expected columns. Arguments are {file name} */ + "Your {0} file doesn't have the expected columns, please make sure it matches the documented format.")), + fileName)); + } +} + +void DataFile::reportFatalFieldError(std::errc code, std::string_view fileName, std::string_view fieldName, const DataFileField &field) +{ + switch (code) { + case std::errc::invalid_argument: + app_fatal(fmt::format(fmt::runtime(_( + /* TRANSLATORS: Error message when parsing a data file and a text value is encountered when a number is expected. Arguments are {found value}, {column heading}, {file name}, {row/record number}, {column/field number} */ + "Non-numeric value {0} for {1} in {2} at row {3} and column {4}")), + field.currentValue(), fieldName, fileName, field.row(), field.column())); + case std::errc::result_out_of_range: + app_fatal(fmt::format(fmt::runtime(_( + /* TRANSLATORS: Error message when parsing a data file and we find a number larger than expected. Arguments are {found value}, {column heading}, {file name}, {row/record number}, {column/field number} */ + "Out of range value {0} for {1} in {2} at row {3} and column {4}")), + field.currentValue(), fieldName, fileName, field.row(), field.column())); + default: + app_fatal(fmt::format(fmt::runtime(_( + /* TRANSLATORS: Fallback error message when an error occurs while parsing a data file and we can't determine the cause. 
Arguments are {file name}, {row/record number}, {column/field number} */ + "Unexpected error while reading {0} at row {1} and column {2}")), + fileName, field.row(), field.column())); + } +} + +tl::expected DataFile::parseHeader(ColumnDefinition *begin, ColumnDefinition *end, tl::function_ref(std::string_view)> mapper) +{ + std::bitset::max()> seenColumns; + unsigned lastColumn = 0; + + RecordIterator firstRecord { data(), data() + size(), false }; + for (DataFileField field : *firstRecord) { + if (begin == end) { + // All key columns have been identified + break; + } + + auto mapResult = mapper(*field); + if (!mapResult.has_value()) { + // not a key column + continue; + } + + uint8_t columnType = mapResult.value(); + if (seenColumns.test(columnType)) { + // Repeated column? unusual, maybe this should be an error + continue; + } + seenColumns.set(columnType); + + unsigned skipColumns = 0; + if (field.column() > lastColumn) + skipColumns = field.column() - lastColumn - 1; + lastColumn = field.column(); + + *begin = { columnType, skipColumns }; + ++begin; + } + + // Incrementing the iterator causes it to read to the end of the record in case we broke early (maybe there were extra columns) + ++firstRecord; + if (firstRecord == this->end()) { + return tl::unexpected { Error::NoContent }; + } + + body_ = firstRecord.data(); + + if (begin != end) { + return tl::unexpected { Error::NotEnoughColumns }; + } + return {}; +} } // namespace devilution diff --git a/Source/data/file.hpp b/Source/data/file.hpp index b1842862d..6c182ae36 100644 --- a/Source/data/file.hpp +++ b/Source/data/file.hpp @@ -4,17 +4,59 @@ #include #include +#include #include "iterators.hpp" namespace devilution { + +struct ColumnDefinition { + uint8_t type; + + enum class Error { + UnknownColumn + }; + + // The number of fields between this column and the last one identified as important (or from start of the record if this is the first column we care about) + unsigned skipLength = 0; + + 
ColumnDefinition() + : type(std::numeric_limits::max()) + { + } + + ColumnDefinition(unsigned type) + : type(type) + { + } + + ColumnDefinition(unsigned type, unsigned skipLength) + : type(type) + , skipLength(skipLength) + { + } + + bool operator==(const ColumnDefinition &other) const + { + return type == other.type && skipLength == other.skipLength; + } + + template + explicit operator T() const + { + return static_cast(type); + } +}; + /** * @brief Container for a tab-delimited file following the TSV-like format described in txtdata/Readme.md */ class DataFile { - std::unique_ptr data_; + std::unique_ptr data_; std::string_view content_; + const char *body_; + DataFile() = delete; /** @@ -22,19 +64,24 @@ class DataFile { * @param data pointer to the raw data backing the view (this container will take ownership to ensure the lifetime of the view) * @param size total number of bytes/code units including the BOM if present */ - DataFile(std::unique_ptr &&data, size_t size) + DataFile(std::unique_ptr &&data, size_t size) : data_(std::move(data)) , content_(data_.get(), size) { constexpr std::string_view utf8BOM = "\xef\xbb\xbf"; if (this->content_.starts_with(utf8BOM)) this->content_.remove_prefix(utf8BOM.size()); + + body_ = this->content_.data(); } public: enum class Error { NotFound, - ReadError + OpenFailed, + BadRead, + NoContent, + NotEnoughColumns }; /** @@ -46,19 +93,52 @@ public: */ static tl::expected load(std::string_view path); - [[nodiscard]] RecordsRange records() const + static void reportFatalError(Error code, std::string_view fileName); + static void reportFatalFieldError(std::errc code, std::string_view fileName, std::string_view fieldName, const DataFileField &field); + + void resetHeader() { - return content_; + body_ = content_.data(); } - [[nodiscard]] const char *begin() const + /** + * @brief Attempts to parse the first row/record in the file, populating the range defined by [begin, end) using the provided mapping function + * + * This method 
will also set an internal marker so that future uses of the begin iterator skip the header line. + * @param begin Start of the destination range + * @param end End of the destination range + * @param mapper Function that maps from a string_view to a unique numeric identifier for the column + * @return If the file ends after the header or not enough columns were defined this function returns an error code describing the failure. + */ + [[nodiscard]] tl::expected parseHeader(ColumnDefinition *begin, ColumnDefinition *end, tl::function_ref(std::string_view)> mapper); + + /** + * @brief Templated version of parseHeader(uint8_t) to allow using directly with enum definitions of columns + * @tparam T An enum or any type that defines operator uint8_t() + * @param begin Start of the destination range + * @param end End of the destination range + * @param typedMapper Function that maps from a string_view to a unique T value + * @return A void success result or an error code as described above + */ + template + [[nodiscard]] tl::expected parseHeader(ColumnDefinition *begin, ColumnDefinition *end, std::function(std::string_view)> typedMapper) { - return content_.data(); + return parseHeader(begin, end, [typedMapper](std::string_view label) { return typedMapper(label).transform([](T value) { return static_cast(value); }); }); + } + + [[nodiscard]] RecordIterator begin() const + { + return { body_, data() + size(), body_ != data() }; } - [[nodiscard]] const char *end() const + [[nodiscard]] RecordIterator end() const { - return content_.data() + content_.size(); + return {}; + } + + [[nodiscard]] const char *data() const + { + return content_.data(); } [[nodiscard]] size_t size() const diff --git a/Source/data/iterators.hpp b/Source/data/iterators.hpp index c9a8937c2..4bbf35e79 100644 --- a/Source/data/iterators.hpp +++ b/Source/data/iterators.hpp @@ -1,234 +1,365 @@ #pragma once #include +#include + +#include #include "parser.hpp" namespace devilution { -class 
FieldsInRecordRange { + +class DataFileField { GetFieldResult *state_; - const char *const end_; + const char *end_; + unsigned row_; + unsigned column_; public: - FieldsInRecordRange(GetFieldResult *state, const char *end) + DataFileField(GetFieldResult *state, const char *end, unsigned row, unsigned column) : state_(state) , end_(end) + , row_(row) + , column_(column) { } - class Iterator { - GetFieldResult *state_; - const char *const end_; + /** + * @brief Returns a view of the current field + * + * This method scans the current field if this is the first value access since the last + * advance. If you expect the field to contain a numeric value then calling parseInt first + * is more efficient, but calling the methods in either order is supported. + * @return The current field value (may be an empty string) or a zero length string_view + */ + [[nodiscard]] std::string_view value() + { + if (state_->status == GetFieldResult::Status::ReadyToRead) { + *state_ = GetNextField(state_->next, end_); + } + return state_->value; + } - public: - using iterator_category = std::input_iterator_tag; - using value_type = std::string_view; + /** + * Convenience function to let DataFileField instances be used like other single-value STL containers + */ + [[nodiscard]] std::string_view operator*() + { + return this->value(); + } - Iterator() - : state_(nullptr) - , end_(nullptr) - { + /** + * @brief Attempts to parse the current field as a numeric value using std::from_chars + * + * You can freely interleave this method with calls to operator*. If this is the first value + * access since the last advance this will scan the current field and store it for later + * use with operator* or repeated calls to parseInt (even with different types). 
+ * @tparam T an Integral type supported by std::from_chars + * @param destination value to store the result of successful parsing + * @return the error code from std::from_chars + */ + template + [[nodiscard]] std::errc parseInt(T &destination) + { + std::from_chars_result result {}; + if (state_->status == GetFieldResult::Status::ReadyToRead) { + const char *begin = state_->next; + result = std::from_chars(begin, end_, destination); + if (result.ec != std::errc::invalid_argument) { + // from_chars was able to consume at least one character, consume the rest of the field + *state_ = GetNextField(result.ptr, end_); + // and prepend what was already parsed + state_->value = { begin, (state_->value.data() - begin) + state_->value.size() }; + } + } else { + result = std::from_chars(state_->value.data(), end_, destination); } + return result.ec; + } - Iterator(GetFieldResult *state, const char *end) - : state_(state) - , end_(end) - { - state_->status = GetFieldResult::Status::ReadyToRead; + template + [[nodiscard]] tl::expected asInt() + { + T value = 0; + auto parseIntResult = parseInt(value); + if (parseIntResult == std::errc()) { + return value; } + return tl::unexpected { parseIntResult }; + } - [[nodiscard]] bool operator==(const Iterator &rhs) const - { - if (state_ == nullptr && rhs.state_ == nullptr) - return true; + /** + * Returns the current row number + */ + [[nodiscard]] unsigned row() const + { + return row_; + } - return state_ != nullptr && rhs.state_ != nullptr && state_->next == rhs.state_->next; - } + /** + * Returns the current column/field number (from the start of the row/record) + */ + [[nodiscard]] unsigned column() const + { + return column_; + } - [[nodiscard]] bool operator!=(const Iterator &rhs) const - { - return !(*this == rhs); - } + /** + * Allows accessing the value of this field in a const context + * + * This requires an actual non-const value access to happen first before it returns + * any useful results, intended for use in error 
reporting (or test output). + */ + [[nodiscard]] std::string_view currentValue() const + { + return state_->value; + } +}; - /** - * Advances to the next field in the current record - */ - Iterator &operator++() - { - return *this += 1; - } +/** + * @brief Show the field value along with the row/column number (mainly used in test failure messages) + * @param stream output stream, expected to have overloads for unsigned, std::string_view, and char* + * @param field Object to display + * @return the stream, to allow chaining + */ +inline std::ostream &operator<<(std::ostream &stream, const DataFileField &field) +{ + return stream << "\"" << field.currentValue() << "\" (at row " << field.row() << ", column " << field.column() << ")"; +} + +class FieldIterator { + GetFieldResult *state_; + const char *const end_; + const unsigned row_; + unsigned column_ = 0; - /** - * @brief Advances by the specified number of fields - * - * if a non-zero increment is provided and advancing the iterator causes it to reach the end - * of the record the iterator is invalidated. 
It will compare equal to an end iterator and - * cannot be used for value access or any further parsing - * @param increment how many fields to advance (can be 0) - * @return self-reference - */ - Iterator &operator+=(unsigned increment) - { - if (state_->status == GetFieldResult::Status::ReadyToRead) - *state_ = DiscardMultipleFields(state_->next, end_, increment); - - if (state_->endOfRecord()) { - state_ = nullptr; - } else { - // We use Status::ReadyToRead as a marker so we only read the next value on the next call to operator* - // this allows consumers to read from the input stream without using operator* if they want - state_->status = GetFieldResult::Status::ReadyToRead; - } - return *this; - } +public: + using iterator_category = std::input_iterator_tag; + using value_type = DataFileField; - /** - * @brief Returns a view of the current field - * - * This method scans the current field if this is the first value access since the last - * advance. You can repeatedly call operator* safely but you must not try to use parseInt - * after calling this method. If you calling parseInt and get an invalid_argument result - * back you can use this method to get the actual field value, but if you received any - * other result then parseInt has consumed the field and this method is no longer reliable. - * @return The current field value (may be an empty string) or a zero length string_view - */ - [[nodiscard]] value_type operator*() - { - if (state_->status == GetFieldResult::Status::ReadyToRead) { - *state_ = GetNextField(state_->next, end_); - } - return state_->value; - } + FieldIterator() + : state_(nullptr) + , end_(nullptr) + , row_(0) + { + } - /** - * @brief Attempts to parse a field as a numeric value using std::from_chars - * - * This method is NOT repeatable. In addition to the behaviour of std::from_chars please - * heed the following warnings. 
If the field contains a value larger than will fit into the - * given type parsing will fail with std::errc::out_of_range and the field will be - * discarded. If a field starts with some digits then is followed by other characters the - * remainder will also be discarded. You can only get a useful value out of operator* if - * the result code is std::errc::invalid_argument. - * @tparam T an Integral type supported by std::from_chars - * @param destination value to store the result of successful parsing - * @return the error code from std::from_chars - */ - template - [[nodiscard]] std::errc parseInt(T &destination) - { - auto result = std::from_chars(state_->next, end_, destination); - if (result.ec != std::errc::invalid_argument) { - // from_chars was able to consume at least one character, so discard the rest of the field - *state_ = DiscardField(result.ptr, end_); - } - return result.ec; + FieldIterator(GetFieldResult *state, const char *end, unsigned row) + : state_(state) + , end_(end) + , row_(row) + { + state_->status = GetFieldResult::Status::ReadyToRead; + } + + [[nodiscard]] bool operator==(const FieldIterator &rhs) const + { + if (state_ == nullptr && rhs.state_ == nullptr) + return true; + + return state_ != nullptr && rhs.state_ != nullptr && state_->next == rhs.state_->next; + } + + [[nodiscard]] bool operator!=(const FieldIterator &rhs) const + { + return !(*this == rhs); + } + + /** + * Advances to the next field in the current record + */ + FieldIterator &operator++() + { + return *this += 1; + } + + /** + * @brief Advances by the specified number of fields + * + * if a non-zero increment is provided and advancing the iterator causes it to reach the end + * of the record the iterator is invalidated. 
It will compare equal to an end iterator and + * cannot be used for value access or any further parsing + * @param increment how many fields to advance (can be 0) + * @return self-reference + */ + FieldIterator &operator+=(unsigned increment) + { + if (increment == 0) + return *this; + + if (state_->status == GetFieldResult::Status::ReadyToRead) { + // We never read the value and no longer need it, discard it so that we end up + // advancing past the field delimiter (as if a value access had happened) + *state_ = DiscardField(state_->next, end_); } - /** - * @brief Checks if this field is the last field in a file, not just in the record - * - * If you call this method before calling parseInt or operator* then this will trigger a - * read, meaning you can no longer use parseInt to try extract a numeric value. You - * probably want to use this after calling parseInt. - * @return true if this field is the last field in the file/RecordsRange - */ - [[nodiscard]] bool endOfFile() const - { - if (state_->status == GetFieldResult::Status::ReadyToRead) { - *state_ = GetNextField(state_->next, end_); - } - return state_->endOfFile(); + if (state_->endOfRecord()) { + state_ = nullptr; + } else { + unsigned fieldsSkipped = 0; + // By this point we've already advanced past the end of this field (either because the + // last value access found the end of the field by necessity or we discarded it a few + // lines up), so we only need to advance further if an increment greater than 1 was + // provided. + *state_ = DiscardMultipleFields(state_->next, end_, increment - 1, &fieldsSkipped); + // As we've consumed the current field by this point we need to increment the internal + // column counter one extra time so we have an accurate value. + column_ += fieldsSkipped + 1; + // We use Status::ReadyToRead as a marker so we only read the next value on the next + // value access, this allows consumers to choose the most efficient method (e.g. 
if + // they want the value as an int) or even repeated advances without using a value. + state_->status = GetFieldResult::Status::ReadyToRead; } - }; + return *this; + } - [[nodiscard]] Iterator begin() const + /** + * @brief Returns a view of the current field + * + * The returned value is a thin wrapper over the current state of this iterator (or last + * successful read if incrementing this iterator would result in it reaching the end state). + */ + [[nodiscard]] value_type operator*() { - return { state_, end_ }; + return { state_, end_, row_, column_ }; } - [[nodiscard]] Iterator end() const + /** + * @brief Returns the current row number + */ + [[nodiscard]] unsigned row() const { - return {}; + return row_; + } + /** + * @brief Returns the current column/field number (from the start of the row/record) + */ + [[nodiscard]] unsigned column() const + { + return column_; } }; -class RecordsRange { - const char *const begin_; +class DataFileRecord { + GetFieldResult *state_; const char *const end_; + const unsigned row_; public: - RecordsRange(std::string_view characters) - : begin_(characters.data()) - , end_(characters.data() + characters.size()) + DataFileRecord(GetFieldResult *state, const char *end, unsigned row) + : state_(state) + , end_(end) + , row_(row) { } - class Iterator { - GetFieldResult state_; - const char *const end_; + [[nodiscard]] FieldIterator begin() + { + return { state_, end_, row_ }; + } - public: - using iterator_category = std::forward_iterator_tag; - using value_type = FieldsInRecordRange; + [[nodiscard]] FieldIterator end() const + { + return {}; + } - Iterator() - : state_(nullptr, GetFieldResult::Status::EndOfFile) - , end_(nullptr) - { - } + [[nodiscard]] unsigned row() const + { + return row_; + } +}; - Iterator(const char *begin, const char *end) - : state_(begin) - , end_(end) - { - } +class RecordIterator { + GetFieldResult state_; + const char *const end_; + unsigned row_ = 0; - [[nodiscard]] bool operator==(const Iterator 
&rhs) const - { - return state_.next == rhs.state_.next; - } +public: + using iterator_category = std::forward_iterator_tag; + using value_type = DataFileRecord; - [[nodiscard]] bool operator!=(const Iterator &rhs) const - { - return !(*this == rhs); - } + RecordIterator() + : state_(nullptr, GetFieldResult::Status::EndOfFile) + , end_(nullptr) + { + } - Iterator &operator++() - { - return *this += 1; - } + RecordIterator(const char *begin, const char *end, bool skippedHeader) + : state_(begin) + , end_(end) + , row_(skippedHeader ? 1 : 0) + { + } - Iterator &operator+=(unsigned increment) - { - if (!state_.endOfRecord()) - state_ = DiscardRemainingFields(state_.next, end_); - - if (state_.endOfFile()) { - state_.next = nullptr; - } else { - if (increment > 0) - state_ = DiscardMultipleRecords(state_.next, end_, increment - 1); - // We use Status::ReadyToRead as a marker in case the Field iterator is never - // used, so the next call to operator+= will advance past the current record - state_.status = GetFieldResult::Status::ReadyToRead; - } + [[nodiscard]] bool operator==(const RecordIterator &rhs) const + { + return state_.next == rhs.state_.next; + } + + [[nodiscard]] bool operator!=(const RecordIterator &rhs) const + { + return !(*this == rhs); + } + + RecordIterator &operator++() + { + return *this += 1; + } + + RecordIterator &operator+=(unsigned increment) + { + if (increment == 0) return *this; + + if (!state_.endOfRecord()) { + // The field iterator either hasn't been used or hasn't consumed the entire record + state_ = DiscardRemainingFields(state_.next, end_); } - [[nodiscard]] value_type operator*() - { - return { &state_, end_ }; + if (state_.endOfFile()) { + state_.next = nullptr; + } else { + unsigned recordsSkipped = 0; + // By this point we've already advanced past the end of this record (either because the + // last value access found the end of the record by necessity or we discarded any + // leftovers a few lines up), so we only need to advance 
further if an increment + // greater than 1 was provided. + state_ = DiscardMultipleRecords(state_.next, end_, increment - 1, &recordsSkipped); + // As we've consumed the current record by this point we need to increment the internal + // row counter one extra time so we have an accurate value. + row_ += recordsSkipped + 1; + // We use Status::ReadyToRead as a marker in case the DataFileField iterator is never + // used, so the next call to operator+= will advance past the current record + state_.status = GetFieldResult::Status::ReadyToRead; } - }; + return *this; + } - [[nodiscard]] Iterator begin() const + [[nodiscard]] DataFileRecord operator*() { - return { begin_, end_ }; + return { &state_, end_, row_ }; } - [[nodiscard]] Iterator end() const + /** + * @brief Exposes the current location of this input iterator. + * + * This is only expected to be used internally so the DataFile instance knows where the header + * ends and the body begins. You probably don't want to use this directly. + */ + [[nodiscard]] const char *data() const { - return {}; + return state_.next; + } + + /** + * @brief Returns the current row/record number (from the start of the file) + * + * The header row is always considered row 0, however if you've called DataFile.parseHeader() + * before calling DataFile.begin() then you'll get row 1 as the first record of the range. 
+ */ + [[nodiscard]] unsigned row() const + { + return row_; } }; } // namespace devilution diff --git a/Source/data/parser.cpp b/Source/data/parser.cpp index 805d30bbb..1ad3ad69e 100644 --- a/Source/data/parser.cpp +++ b/Source/data/parser.cpp @@ -1,10 +1,10 @@ #include "parser.hpp" namespace devilution { -GetFieldResult HandleRecordSeparator(const char *begin, const char *end) +GetFieldResult HandleRecordTerminator(const char *begin, const char *end) { if (begin == end) { - return { end, GetFieldResult::Status::EndOfFile }; + return { end, GetFieldResult::Status::NoFinalTerminator }; } if (*begin == '\r') { @@ -15,36 +15,49 @@ GetFieldResult HandleRecordSeparator(const char *begin, const char *end) // carriage returns should be followed by a newline, so let's let the following checks handle it } if (*begin == '\n') { - return { begin + 1, GetFieldResult::Status::EndOfRecord }; + ++begin; + if (begin == end) { + return { end, GetFieldResult::Status::EndOfFile }; + } + + return { begin, GetFieldResult::Status::EndOfRecord }; } - return { begin, GetFieldResult::Status::BadRecordSeparator }; + return { begin, GetFieldResult::Status::BadRecordTerminator }; } -GetFieldResult DiscardMultipleFields(const char *begin, const char *end, unsigned skipLength) +GetFieldResult DiscardMultipleFields(const char *begin, const char *end, unsigned skipLength, unsigned *fieldsSkipped) { GetFieldResult result { begin }; - while (skipLength > 0) { + unsigned skipCount = 0; + while (skipCount < skipLength) { + ++skipCount; result = DiscardField(result.next, end); if (result.endOfRecord()) { - // Found the end of record early, we can reuse the error code so just return it - return result; + // Found the end of record early + break; } - --skipLength; + } + if (fieldsSkipped != nullptr) { + *fieldsSkipped = skipCount; } return result; } -GetFieldResult DiscardMultipleRecords(const char *begin, const char *end, unsigned skipLength) +GetFieldResult DiscardMultipleRecords(const char *begin, 
const char *end, unsigned skipLength, unsigned *recordsSkipped) { GetFieldResult result { begin }; - while (skipLength > 0) { + unsigned skipCount = 0; + while (skipCount < skipLength) { + ++skipCount; result = DiscardRemainingFields(result.next, end); if (result.endOfFile()) { - // Found the end of file early, we can reuse the error code so just return it - return result; + // Found the end of file early + break; } - --skipLength; + } + if (recordsSkipped != nullptr) { + *recordsSkipped = skipCount; } return result; } diff --git a/Source/data/parser.hpp b/Source/data/parser.hpp index b8c4cd6d6..90ce4d9ef 100644 --- a/Source/data/parser.hpp +++ b/Source/data/parser.hpp @@ -16,9 +16,10 @@ struct GetFieldResult { ReadyToRead, EndOfField, EndOfRecord, - BadRecordSeparator, + BadRecordTerminator, EndOfFile, - FileTruncated + FileTruncated, + NoFinalTerminator } status = Status::ReadyToRead; @@ -53,7 +54,7 @@ struct GetFieldResult { */ [[nodiscard]] bool endOfRecord() const { - return IsAnyOf(status, Status::EndOfRecord, Status::BadRecordSeparator) || endOfFile(); + return IsAnyOf(status, Status::EndOfRecord, Status::BadRecordTerminator) || endOfFile(); } /** @@ -61,16 +62,16 @@ struct GetFieldResult { */ [[nodiscard]] bool endOfFile() const { - return IsAnyOf(status, Status::EndOfFile, Status::FileTruncated); + return IsAnyOf(status, Status::EndOfFile, Status::FileTruncated, Status::NoFinalTerminator); } }; /** - * @brief Checks if this character is potentially part of a record separator sequence + * @brief Checks if this character is potentially part of a record terminator sequence * @param c character to check - * @return true if it's a record separator (lf) or carriage return (cr, accepted as the start of a crlf pair) + * @return true if it's a record terminator (lf) or carriage return (cr, accepted as the start of a crlf pair) */ -constexpr bool IsRecordSeparator(char c) +constexpr bool IsRecordTerminator(char c) { return c == '\r' || c == '\n'; } @@ -78,34 +79,33 
@@ constexpr bool IsRecordSeparator(char c) /** * @brief Checks if this character is a field separator * - * Note that record separator sequences also act to delimit fields + * Note that record terminator sequences also act to separate fields * @param c character to check - * @return true if it's a field separator (tab) or part of a record separator sequence + * @return true if it's a field separator (tab) or part of a record terminator sequence */ constexpr bool IsFieldSeparator(char c) { - return c == '\t' || IsRecordSeparator(c); + return c == '\t' || IsRecordTerminator(c); } /** - * @brief Consumes the current record separator sequence and returns a result describing whether at least one more record is available. + * @brief Consumes the current record terminator sequence and returns a result describing whether at least one more record is available. * - * Assumes that begin points to a record separator (cr, lf, or EOF). If we reached the end of the - * stream (endOfField == end) then `status` will compare equal to - * GetFieldResult::Status::EndOfFile. Otherwise if a valid record separator sequence was found (lf - * or crlf) then the `status` equals GetFieldResult::Status::EndOfRecord. If we found a carriage - * return (cr) character just before the end of the stream then it's likely the file was - * truncated, `status` will contain GetFieldResult::Status::FileTruncated. If we found a carriage - * return that is followed by any character other than a newline (lf), or `begin` didn't point to - * a record separator, then `status` will be GetFieldResult::Status::BadRecordSeparator and the - * file has probably been mangled. - * @param begin start of a stream (expected to be pointing to a record separator) + * Assumes that begin points to a record terminator (lf or crlf) or the last read reached the end + * of the final record (in which case the terminator is optional). 
If we reached the end of the + * stream (`begin == end`) then `status` will compare equal to Status::NoFinalTerminator. If we + * found a carriage return (cr) character just before the end of the stream then it's likely the + * file was truncated, `status` will contain Status::FileTruncated. If we found a carriage return + * that is followed by any character other than a newline (lf), or `begin` didn't point to a record + * terminator, then `status` will be Status::BadRecordTerminator and the file has probably been + * mangled. Otherwise `status` will be Status::EndOfRecord if at least one more record is available or + * Status::EndOfFile if this was the end of the last record. + * @param begin start of a stream (expected to be pointing to a record terminator) * @param end one past the last character of the stream * @return a struct containing a pointer to the start of the next record (if more characters are - * available) or a copy of end, and optionally a status code describing what type of - * separator was found. + * available) or a copy of end, and a status code describing the terminator. -GetFieldResult HandleRecordSeparator(const char *begin, const char *end); +GetFieldResult HandleRecordTerminator(const char *begin, const char *end); /** * @brief Consumes the current field (or record) separator and returns a result describing whether at least one more field is available. @@ -113,7 +113,7 @@ GetFieldResult HandleRecordSeparator(const char *begin, const char *end); * Assumes that begin points to a field or record separator (tab, cr, lf, or EOF). If there are * more fields in the current record then the return value will have a pointer to the start of the * next field and `status` will be GetFieldResult::Status::EndOfField. Otherwise refer to - * HandleRecordSeparator for a description of the different codes. + * HandleRecordTerminator for a description of the different codes.
* @param begin start of a stream (expected to be pointing to a record separator) * @param end one past the last character of the stream * @return a struct containing a pointer to the start of the next field (if more characters are @@ -126,7 +126,7 @@ inline GetFieldResult HandleFieldSeparator(const char *begin, const char *end) return { begin + 1, GetFieldResult::Status::EndOfField }; } - return HandleRecordSeparator(begin, end); + return HandleRecordTerminator(begin, end); } /** @@ -148,20 +148,24 @@ inline GetFieldResult DiscardField(const char *begin, const char *end) * @param begin first character of the stream * @param end one past the last character in the stream * @param skipLength how many fields to skip (specifying 0 will cause the method to return without advancing) + * @param fieldsSkipped optional output parameter, will be filled with a count of how many fields + * were skipped in case the end of record was reached early * @return a GetFieldResult struct containing an empty value, a pointer to the start of the next * field/record, and a status code describing what type of separator was found */ -GetFieldResult DiscardMultipleFields(const char *begin, const char *end, unsigned skipLength); +GetFieldResult DiscardMultipleFields(const char *begin, const char *end, unsigned skipLength, unsigned *fieldsSkipped = nullptr); /** * @brief Advances by the specified number of records or until the end of the file, whichever occurs first * @param begin first character of the stream * @param end one past the last character in the stream * @param skipLength how many records to skip (specifying 0 will cause the method to return without advancing) + * @param recordsSkipped optional output parameter, will be filled with a count of how many records + * were skipped in case the end of file was reached early * @return a GetFieldResult struct containing an empty value, a pointer to the start of the next * record, and a status code describing what type of separator was found 
*/ -GetFieldResult DiscardMultipleRecords(const char *begin, const char *end, unsigned skipLength); +GetFieldResult DiscardMultipleRecords(const char *begin, const char *end, unsigned skipLength, unsigned *recordsSkipped = nullptr); /** * @brief Discard any remaining fields in the current record @@ -172,9 +176,9 @@ GetFieldResult DiscardMultipleRecords(const char *begin, const char *end, unsign */ inline GetFieldResult DiscardRemainingFields(const char *begin, const char *end) { - const char *nextSeparator = std::find_if(begin, end, IsRecordSeparator); + const char *nextSeparator = std::find_if(begin, end, IsRecordTerminator); - return HandleRecordSeparator(nextSeparator, end); + return HandleRecordTerminator(nextSeparator, end); } /** @@ -185,7 +189,7 @@ inline GetFieldResult DiscardRemainingFields(const char *begin, const char *end) * `next` member of the returned type will be either the start of the next field/record or `end`. * The `status` member contains additional information to distinguish between the end of a field * and the end of a record. If there are additional fields in this record then `status` will be - * GetFieldResult::Status::EndOfField, otherwise refer to HandleRecordSeparator for the meanings + * GetFieldResult::Status::EndOfField, otherwise refer to HandleRecordTerminator for the meanings * associated with the remaining codes. 
* @param begin first character of the stream * @param end one past the last character in the stream diff --git a/Source/playerdat.cpp b/Source/playerdat.cpp index 508a609d4..99b93b960 100644 --- a/Source/playerdat.cpp +++ b/Source/playerdat.cpp @@ -45,7 +45,7 @@ public: [[nodiscard]] uint32_t getThresholdForLevel(unsigned level) const { if (level > 0) - return levelThresholds[std::min(level - 1, getMaxLevel())]; + return levelThresholds[std::min(level - 1, getMaxLevel())]; return 0; } @@ -64,167 +64,76 @@ public: } } ExperienceData; -struct ExperienceColumnDefinition { - enum class ColumnType { - Level, - Experience, - LAST = Experience - } type; - - enum class Error { - UnknownColumn - }; - - // The number of fields between this column and the last one identified as important (or from start of the record if this is the first column we care about) - unsigned skipLength; - - static tl::expected mapNameToType(std::string_view name) - { - if (name == "Level") { - return ColumnType::Level; - } - if (name == "Experience") { - return ColumnType::Experience; - } - return tl::unexpected { Error::UnknownColumn }; - } - - ExperienceColumnDefinition() = delete; - - ExperienceColumnDefinition(const ColumnType &type) - : type(type) - , skipLength(0) - { - } +enum class ExperienceColumn { + Level, + Experience, + LAST = Experience +}; - ExperienceColumnDefinition(const ColumnType &type, unsigned skipLength) - : type(type) - , skipLength(skipLength) - { +tl::expected mapExperienceColumnFromName(std::string_view name) +{ + if (name == "Level") { + return ExperienceColumn::Level; } - - bool operator==(const ExperienceColumnDefinition &other) const - { - return type == other.type && skipLength == other.skipLength; + if (name == "Experience") { + return ExperienceColumn::Experience; } -}; + return tl::unexpected { ColumnDefinition::Error::UnknownColumn }; +} void ReloadExperienceData() { constexpr std::string_view filename = "txtdata\\Experience.tsv"; auto dataFileResult = 
DataFile::load(filename); if (!dataFileResult.has_value()) { - app_fatal(fmt::format(fmt::runtime(_(/* TRANSLATORS: Error message when a data file is missing or corrupt */ "Unable to load player experience data from file {:s}")), filename)); + DataFile::reportFatalError(dataFileResult.error(), filename); } - const DataFile &dataFile = dataFileResult.value(); - - constexpr unsigned ExpectedColumnCount = enum_size::value; - - StaticVector columns; - std::bitset seenColumns; - - unsigned currentColumn = 0; - unsigned lastColumn = 0; - RecordsRange records = dataFile.records(); - auto record = records.begin(); - auto endRecord = records.end(); - for (std::string_view field : *record) { - if (columns.size() >= ExpectedColumnCount) { - // All key columns have been identified - break; - } + DataFile &dataFile = dataFileResult.value(); - auto columnType = ExperienceColumnDefinition::mapNameToType(field); - if (columnType.has_value() && !seenColumns.test(static_cast(columnType.value()))) { - seenColumns.set(static_cast(columnType.value())); - unsigned skipColumns = 0; - if (currentColumn > lastColumn) - skipColumns = currentColumn - lastColumn - 1; - columns.emplace_back(columnType.value(), skipColumns); - lastColumn = currentColumn; - } - ++currentColumn; - } - ++record; - - if (record == endRecord) { - // The data file ended after the header, since there's no data we can't proceed - app_fatal(fmt::format(fmt::runtime(_( - /* TRANSLATORS: Error message when a data file is empty or only contains the header row */ - "{:s} is incomplete, please check the file contents.")), - filename)); - } + constexpr unsigned ExpectedColumnCount = enum_size::value; + + std::array columns; + auto parseHeaderResult = dataFile.parseHeader(columns.data(), columns.data() + columns.size(), mapExperienceColumnFromName); - if (columns.size() < ExpectedColumnCount) { - // The data file doesn't have the required headers. 
Though we could potentially just allocate - // missing columns in the default order that's likely to lead to further corruption in saves - app_fatal(fmt::format(fmt::runtime(_( - /* TRANSLATORS: Error message when a data file doesn't contain the expected columns */ - "Your {:s} file doesn't have the expected columns, please make sure it matches the documented format.")), - filename)); + if (!parseHeaderResult.has_value()) { + DataFile::reportFatalError(parseHeaderResult.error(), filename); } ExperienceData.clear(); - unsigned row = 1; // current line/record number for error messages - while (record != endRecord) { + for (DataFileRecord record : dataFile) { uint8_t level = 0; uint32_t experience = 0; bool skipRecord = false; - FieldsInRecordRange fields = *record; - auto field = fields.begin(); - auto endField = fields.end(); - unsigned col = 0; // current field number for error messages + FieldIterator fieldIt = record.begin(); + FieldIterator endField = record.end(); for (auto &column : columns) { - col += column.skipLength; - field += column.skipLength; + fieldIt += column.skipLength; - if (field == endField) { - // reached the end of record early, this could be from a trailing newline so don't throw an error - skipRecord = true; - break; + if (fieldIt == endField) { + DataFile::reportFatalError(DataFile::Error::NotEnoughColumns, filename); } - switch (column.type) { - case ExperienceColumnDefinition::ColumnType::Level: { + DataFileField field = *fieldIt; + + switch (static_cast(column)) { + case ExperienceColumn::Level: { auto parseIntResult = field.parseInt(level); - if (parseIntResult == std::errc::invalid_argument) { - // not a signless numeric value, is this a trailing newline or the MaxLevel line? 
- if (field.endOfFile() || *field == "MaxLevel") { + + if (parseIntResult != std::errc()) { + if (*field == "MaxLevel") { skipRecord = true; } else { - app_fatal(fmt::format(fmt::runtime(_( - /* TRANSLATORS: Error message when parsing the Experience data file and a text value is encountered in the Level column */ - "Expected a positive numeric value for Level in {:s}, found {:s} at row {:d} and column {:d}")), - filename, *field, row, col)); + DataFile::reportFatalFieldError(parseIntResult, filename, "Level", field); } - } else if (parseIntResult == std::errc::result_out_of_range) { - // a level greater than 255 was provided - app_fatal(fmt::format(fmt::runtime(_( - /* TRANSLATORS: Error message when parsing the Experience data file and a text value is encountered in the Level column */ - "Levels above {:d} are not supported, out of range value in {:s} at row {:d} and column {:d}")), - std::numeric_limits::max(), filename, row, col)); } } break; - case ExperienceColumnDefinition::ColumnType::Experience: { + case ExperienceColumn::Experience: { auto parseIntResult = field.parseInt(experience); - if (parseIntResult == std::errc::invalid_argument) { - // not a signless numeric value, is this a trailing newline? 
- if (field.endOfFile()) { - skipRecord = true; - } else { - app_fatal(fmt::format(fmt::runtime(_( - /* TRANSLATORS: Error message when parsing the Experience data file and a text value is encountered in the Experience column */ - "Expected a positive numeric value for Experience in {:s}, found {:s} at row {:d} and column {:d}")), - filename, *field, row, col)); - } - } else if (parseIntResult == std::errc::result_out_of_range) { - // an experience threshold greater than 2^32-1 was provided - app_fatal(fmt::format(fmt::runtime(_( - /* TRANSLATORS: Error message when parsing the Experience data file and a text value is encountered in the Experience column */ - "Experience thresholds above {:d} are not supported, out of range value in {:s} at row {:d} and column {:d}")), - std::numeric_limits::max(), filename, row, col)); + + if (parseIntResult != std::errc()) { + DataFile::reportFatalFieldError(parseIntResult, filename, "Experience", field); } } break; @@ -235,14 +144,11 @@ void ReloadExperienceData() if (skipRecord) break; - ++field; - ++col; + ++fieldIt; } if (!skipRecord) ExperienceData.setThresholdForLevel(level, experience); - ++record; - ++row; } } diff --git a/test/Fixtures.cmake b/test/Fixtures.cmake index 01121b65c..0fcae3e16 100644 --- a/test/Fixtures.cmake +++ b/test/Fixtures.cmake @@ -86,7 +86,11 @@ set(devilutionx_fixtures timedemo/WarriorLevel1to2/spawn_0.sv txtdata/cr.tsv txtdata/crlf.tsv + txtdata/empty.tsv + txtdata/empty_with_utf8_bom.tsv txtdata/lf.tsv + txtdata/lf_no_trail.tsv + txtdata/sample.tsv txtdata/utf8_bom.tsv ) diff --git a/test/data_file_test.cpp b/test/data_file_test.cpp index 298d03818..720586d4b 100644 --- a/test/data_file_test.cpp +++ b/test/data_file_test.cpp @@ -22,14 +22,15 @@ void TestFileContents( GetFieldResult::Status expectedEndOfRecordStatus = GetFieldResult::Status::EndOfRecord, GetFieldResult::Status expectedEndOfFileStatus = GetFieldResult::Status::EndOfFile) { - GetFieldResult result { dataFile.begin() }; + 
GetFieldResult result { dataFile.data() }; + const char *end = dataFile.data() + dataFile.size(); unsigned row = 0; do { ASSERT_LT(row, expectedContent.size()) << "Too many records"; unsigned col = 0; do { ASSERT_LT(col, expectedContent[row].size()) << "Too many fields in record " << row; - result = GetNextField(result.next, dataFile.end()); + result = GetNextField(result.next, end); EXPECT_EQ(result.value, expectedContent[row][col]) << "Unexpected value at record " << row << " and field " << col; col++; } while (!result.endOfRecord()); @@ -65,10 +66,9 @@ TEST(DataFileTest, LoadCRFile) { "", "2", "3" }, { "1", "2", "" }, { "1", "", "3" } - // because the file does not end with a newline parsing stops at the previous record }; - TestFileContents(dataFile, expectedFields, GetFieldResult::Status::BadRecordSeparator, GetFieldResult::Status::FileTruncated); + TestFileContents(dataFile, expectedFields, GetFieldResult::Status::BadRecordTerminator, GetFieldResult::Status::FileTruncated); } TEST(DataFileTest, LoadWindowsFile) @@ -84,7 +84,6 @@ TEST(DataFileTest, LoadWindowsFile) { "", "2", "3" }, { "1", "2", "" }, { "1", "", "3" }, - { "" } // file ends with a newline, parsing returns a single empty field }; TestFileContents(dataFile, expectedFields); @@ -103,12 +102,77 @@ TEST(DataFileTest, LoadTypicalFile) { "", "2", "3" }, { "1", "2", "" }, { "1", "", "3" }, - { "" } // file ends with a newline, parsing returns a single empty field }; TestFileContents(dataFile, expectedFields); } +TEST(DataFileTest, LoadFileWithNoTrailingNewline) +{ + auto result = LoadDataFile("txtdata\\lf_no_trail.tsv"); + ASSERT_TRUE(result.has_value()) << "Unable to load lf_no_trail.tsv"; + + DataFile &dataFile = result.value(); + EXPECT_EQ(dataFile.size(), 32) << "File size should be reported in code units"; + + std::vector> expectedFields { + { "Test", "Empty", "Values" }, + { "", "2", "3" }, + { "1", "2", "" }, + { "1", "", "3" }, + }; + + TestFileContents(dataFile, expectedFields, 
GetFieldResult::Status::EndOfRecord, GetFieldResult::Status::NoFinalTerminator); +} + +std::string_view mapError(DataFile::Error error) +{ + switch (error) { + case DataFile::Error::NotFound: + return "not found"; + case DataFile::Error::OpenFailed: + return "cannot open"; + case DataFile::Error::BadRead: + return "cannot read contents"; + default: + return "unexpected error"; + } +} + +TEST(DataFileTest, LoadEmptyFile) +{ + auto result = LoadDataFile("txtdata\\empty.tsv"); + if (!result.has_value()) { + FAIL() << "Unable to load empty.tsv, error: " << mapError(result.error()); + } + + DataFile &dataFile = result.value(); + EXPECT_EQ(dataFile.size(), 0) << "File size should be reported in code units"; + + std::vector> expectedFields { + { "" }, + }; + + TestFileContents(dataFile, expectedFields, GetFieldResult::Status::NoFinalTerminator, GetFieldResult::Status::NoFinalTerminator); +} + +TEST(DataFileTest, LoadEmptyFileWithBOM) +{ + auto result = LoadDataFile("txtdata\\empty_with_utf8_bom.tsv"); + if (!result.has_value()) { + FAIL() << "Unable to load empty_with_utf8_bom.tsv, error: " << mapError(result.error()); + } + + DataFile &dataFile = result.value(); + EXPECT_EQ(dataFile.size(), 0) << "Loading a file containing a UTF8 byte order marker should strip that prefix"; + + std::vector> expectedFields { + { "" }, + }; + + TestFileContents(dataFile, expectedFields, GetFieldResult::Status::NoFinalTerminator, GetFieldResult::Status::NoFinalTerminator); +} + TEST(DataFileTest, LoadUtf8WithBOM) { auto result = LoadDataFile("txtdata\\utf8_bom.tsv"); @@ -122,12 +186,74 @@ TEST(DataFileTest, LoadUtf8WithBOM) { "", "2", "3" }, { "1", "2", "" }, { "1", "", "3" }, - { "" } // file ends with a newline, parsing returns a single empty field }; TestFileContents(dataFile, expectedFields); } +TEST(DataFileTest, ParseInt) +{ + auto result = LoadDataFile("txtdata\\sample.tsv"); + if (!result.has_value()) { + FAIL() << "Unable to load sample.tsv, error: " << mapError(result.error()); + 
} + + DataFile &dataFile = result.value(); + auto unused = dataFile.parseHeader(nullptr, nullptr, [](std::string_view) -> tl::expected { return tl::unexpected { ColumnDefinition::Error::UnknownColumn }; }); + + EXPECT_TRUE(unused.has_value()) << "Should be able to parse and discard the header from the sample.tsv file"; + + for (DataFileRecord record : dataFile) { + auto fieldIt = record.begin(); + auto end = record.end(); + + ASSERT_NE(fieldIt, end) << "sample.tsv must contain at least one field to use as a test value for strings"; + + // First field is a string that doesn't start with a digit or - character + DataFileField field = *fieldIt; + uint8_t shortVal = 5; + auto parseIntResult = field.parseInt(shortVal); + EXPECT_EQ(parseIntResult, std::errc::invalid_argument) << "Strings are not uint8_t values"; + EXPECT_EQ(shortVal, 5) << "Value is not modified when parsing as uint8_t fails due to non-numeric fields"; + EXPECT_EQ(*field, "Sample") << "Should be able to access the field value as a string after failure"; + ++fieldIt; + + ASSERT_NE(fieldIt, end) << "sample.tsv must contain a second field to use as a test value for small ints"; + + // Second field is a number that fits into an uint8_t value + field = *fieldIt; + shortVal = 5; + parseIntResult = field.parseInt(shortVal); + EXPECT_EQ(parseIntResult, std::errc()) << "Expected " << field << "to fit into a uint8_t variable"; + EXPECT_EQ(shortVal, 145) << "Parsing should give the expected base 10 value"; + EXPECT_EQ(*field, "145") << "Should be able to access the field value as a string even after parsing as an int"; + ++fieldIt; + + ASSERT_NE(fieldIt, end) << "sample.tsv must contain a third field to use as a test value for large ints"; + + // Third field is a number too large for a uint8_t but that fits into an int value + field = *fieldIt; + parseIntResult = field.parseInt(shortVal); + EXPECT_EQ(parseIntResult, std::errc::result_out_of_range) << "A value too large to fit into a uint8_t variable should report 
an error"; + EXPECT_EQ(shortVal, 145) << "Value is not modified when parsing as uint8_t fails due to out of range value"; + int longVal = 42; + parseIntResult = field.parseInt(longVal); + EXPECT_EQ(parseIntResult, std::errc()) << "Expected " << field << "to fit into an int variable"; + EXPECT_EQ(longVal, 70322) << "Value is expected to be parsed into a larger type after an out of range failure"; + EXPECT_EQ(*field, "70322") << "Should be able to access the field value as a string after parsing as an int"; + ++fieldIt; + + ASSERT_NE(fieldIt, end) << "sample.tsv must contain a fourth field to use as a test value for fields that look like ints"; + + // Fourth field is not an integer, but a value that starts with one or more digits that fit into an uint8_t value + field = *fieldIt; + parseIntResult = field.parseInt(shortVal); + EXPECT_EQ(parseIntResult, std::errc()) << "Expected " << field << "to fit into a uint8_t variable (even though it's not really an int)"; + EXPECT_EQ(shortVal, 6) << "Value is loaded as expected until the first non-digit character"; + EXPECT_EQ(*field, "6.34") << "Should be able to access the field value as a string after failure"; + } +} + TEST(DataFileTest, IterateOverRecords) { auto result = LoadDataFile("txtdata\\lf.tsv"); @@ -140,16 +266,51 @@ TEST(DataFileTest, IterateOverRecords) { "", "2", "3" }, { "1", "2", "" }, { "1", "", "3" }, - { "" } // file ends with a newline, parsing returns a single empty field }; unsigned row = 0; - for (FieldsInRecordRange record : dataFile.records()) { + for (DataFileRecord record : dataFile) { + ASSERT_LT(row, expectedFields.size()) << "Too many records"; + unsigned col = 0; + for (DataFileField field : record) { + if (col < expectedFields[row].size()) + EXPECT_EQ(*field, expectedFields[row][col]) << "Unexpected value at record " << row << " and field " << col; + else + ADD_FAILURE() << "Extra value '" << field << "' in record " << row << " at field " << col; + col++; + } + EXPECT_GE(col, 
expectedFields[row].size()) << "Parsing returned fewer fields than expected in record " << row; + row++; + } + EXPECT_EQ(row, expectedFields.size()) << "Parsing returned fewer records than expected"; +} + +TEST(DataFileTest, ParseHeaderThenIterateOverRecords) +{ + auto result = LoadDataFile("txtdata\\lf.tsv"); + ASSERT_TRUE(result.has_value()) << "Unable to load lf.tsv"; + + DataFile &dataFile = result.value(); + + std::vector> expectedFields { + { "", "2", "3" }, + { "1", "2", "" }, + { "1", "", "3" }, + }; + + auto parseHeaderResult = dataFile.parseHeader(nullptr, nullptr, [](std::string_view) -> tl::expected { return tl::unexpected { ColumnDefinition::Error::UnknownColumn }; }); + EXPECT_TRUE(parseHeaderResult.has_value()) << "Expected to be able to parse and discard the header record"; + + unsigned row = 0; + for (DataFileRecord record : dataFile) { ASSERT_LT(row, expectedFields.size()) << "Too many records"; + EXPECT_EQ(row + 1, record.row()) << "DataFileRecord (through iterator) should report a 1-indexed row after parsing the header record"; unsigned col = 0; - for (std::string_view field : record) { + for (DataFileField field : record) { + EXPECT_EQ(record.row(), field.row()) << "Field should report the same row as the current DataFileRecord"; + EXPECT_EQ(col, field.column()) << "Field (through iterator) should report a 0-indexed column"; if (col < expectedFields[row].size()) - EXPECT_EQ(field, expectedFields[row][col]) << "Unexpected value at record " << row << " and field " << col; + EXPECT_EQ(*field, expectedFields[row][col]) << "Unexpected value at record " << row << " and field " << col; else ADD_FAILURE() << "Extra value '" << field << "' in record " << row << " at field " << col; col++; @@ -167,19 +328,25 @@ TEST(DataFileTest, DiscardAllAfterFirstField) DataFile &dataFile = loadDataResult.value(); - std::array expectedFields { "Test", "", "1", "1", "" }; + std::vector> expectedFields { + { "Test", "Empty", "Values" }, + { "", "2", "3" }, + { "1", "2", 
"" }, + { "1", "", "3" }, + }; - GetFieldResult result { dataFile.begin() }; + GetFieldResult result { dataFile.data() }; + const char *end = dataFile.data() + dataFile.size(); unsigned row = 0; do { ASSERT_LT(row, expectedFields.size()) << "Too many records"; - result = GetNextField(result.next, dataFile.end()); - EXPECT_EQ(result.value, expectedFields[row]) << "Unexpected first value at record " << row; + result = GetNextField(result.next, end); + EXPECT_EQ(result.value, expectedFields[row][0]) << "Unexpected first value at record " << row; if (result.endOfRecord() && !result.endOfFile()) ADD_FAILURE() << "Parsing returned fewer fields than expected in record " << row; else - result = DiscardRemainingFields(result.next, dataFile.end()); + result = DiscardRemainingFields(result.next, end); row++; } while (!result.endOfFile()); @@ -193,13 +360,18 @@ TEST(DataFileTest, DiscardAllAfterFirstFieldIterator) DataFile &dataFile = result.value(); - std::array expectedFields { "Test", "", "1", "1", "" }; + std::vector> expectedFields { + { "Test", "Empty", "Values" }, + { "", "2", "3" }, + { "1", "2", "" }, + { "1", "", "3" }, + }; unsigned row = 0; - for (FieldsInRecordRange record : dataFile.records()) { + for (DataFileRecord record : dataFile) { ASSERT_LT(row, expectedFields.size()) << "Too many records"; - for (std::string_view field : record) { - EXPECT_EQ(field, expectedFields[row]) << "Unexpected first value at record " << row; + for (DataFileField field : record) { + EXPECT_EQ(*field, expectedFields[row][0]) << "Field with value " << field << " does not match the expected value for record " << row; break; } row++; @@ -213,23 +385,29 @@ TEST(DataFileTest, DiscardAllUpToLastField) DataFile &dataFile = loadDataResult.value(); - std::array expectedFields { "Values", "3", "", "3", "" }; + std::vector> expectedFields { + { "Test", "Empty", "Values" }, + { "", "2", "3" }, + { "1", "2", "" }, + { "1", "", "3" }, + }; - GetFieldResult result { dataFile.begin() }; + 
GetFieldResult result { dataFile.data() }; + const char *const end = dataFile.data() + dataFile.size(); unsigned row = 0; do { ASSERT_LT(row, expectedFields.size()) << "Too many records"; - result = DiscardMultipleFields(result.next, dataFile.end(), 2); - if (row < expectedFields.size() - 1) { + result = DiscardMultipleFields(result.next, end, 2); + if (row < expectedFields.size()) { EXPECT_FALSE(result.endOfRecord()) << "Parsing returned fewer fields than expected in record " << row; if (!result.endOfRecord()) - result = GetNextField(result.next, dataFile.end()); + result = GetNextField(result.next, end); } - EXPECT_EQ(result.value, expectedFields[row]) << "Unexpected last value at record " << row; + EXPECT_EQ(result.value, expectedFields[row][2]) << "Unexpected last value at record " << row; if (!result.endOfRecord()) { ADD_FAILURE() << "Parsing returned fewer fields than expected in record " << row; - result = DiscardRemainingFields(result.next, dataFile.end()); + result = DiscardRemainingFields(result.next, end); } row++; } while (!result.endOfFile()); @@ -244,32 +422,34 @@ TEST(DataFileTest, SkipFieldIterator) DataFile &dataFile = loadDataResult.value(); - // we don't actually test the last value as incrementing the fields-in-record iterator past - // the last field invalidates the value, we can't recover the same way the procedural interface does. 
- std::array expectedFields { "Values", "3", "", "3", "" }; + std::vector> expectedFields { + { "Test", "Empty", "Values" }, + { "", "2", "3" }, + { "1", "2", "" }, + { "1", "", "3" }, + }; - RecordsRange recordsRange = dataFile.records(); - auto record = recordsRange.begin(); - auto endRecord = recordsRange.end(); unsigned row = 0; - while (record != endRecord) { + for (DataFileRecord record : dataFile) { ASSERT_LT(row, expectedFields.size()) << "Too many records"; - FieldsInRecordRange fieldRange = *record; - auto field = fieldRange.begin(); - auto endField = fieldRange.end(); - field += 2; - if (row < expectedFields.size() - 1) { - if (field == endField) { + auto fieldIt = record.begin(); + auto endField = record.end(); + fieldIt += 0; + EXPECT_EQ((*fieldIt).value(), expectedFields[row][0]) << "Advancing a field iterator by 0 should not discard any values"; + + fieldIt += 2; + if (row < expectedFields.size()) { + if (fieldIt == endField) { ADD_FAILURE() << "Parsing returned fewer fields than expected in record " << row; } else { - EXPECT_EQ(*field, expectedFields[row]) << "Unexpected last value at record " << row; - ++field; + EXPECT_EQ((*fieldIt).value(), expectedFields[row][2]) << "Unexpected last value at record " << row; + ++fieldIt; } } - EXPECT_EQ(field, endField) << "Parsing returned more fields than expected in record " << row; + EXPECT_EQ(fieldIt, endField) << "Parsing returned more fields than expected in record " << row; + EXPECT_EQ(fieldIt.column(), expectedFields[row].size() - 1) << "Field iterator should report the index of the last column"; ++row; - ++record; } EXPECT_EQ(row, expectedFields.size()) << "Parsing returned fewer records than expected"; @@ -283,30 +463,31 @@ TEST(DataFileTest, SkipRowIterator) DataFile &dataFile = loadDataResult.value(); std::vector> expectedFields { - // skipping the first two lines + { "Test", "Empty", "Values" }, + { "", "2", "3" }, { "1", "2", "" }, { "1", "", "3" }, - { "" } // file ends with a newline, parsing 
returns a single empty field }; - RecordsRange recordsRange = dataFile.records(); - auto record = recordsRange.begin(); - auto endRecord = recordsRange.end(); - record += 2; - unsigned row = 0; - while (record != endRecord) { + auto recordIt = dataFile.begin(); + auto endRecord = dataFile.end(); + recordIt += 0; + EXPECT_EQ((*(*recordIt).begin()).value(), expectedFields[0][0]) << "Advancing a record iterator by 0 should not discard any values"; + recordIt += 2; + unsigned row = 2; + while (recordIt != endRecord) { ASSERT_LT(row, expectedFields.size()) << "Too many records"; unsigned col = 0; - for (std::string_view field : *record) { + for (DataFileField field : *recordIt) { if (col < expectedFields[row].size()) - EXPECT_EQ(field, expectedFields[row][col]) << "Unexpected value at record " << row << " and field " << col; + EXPECT_EQ(*field, expectedFields[row][col]) << "Unexpected value at record " << row << " and field " << col; else ADD_FAILURE() << "Extra value '" << field << "' in record " << row << " at field " << col; col++; } EXPECT_GE(col, expectedFields[row].size()) << "Parsing returned fewer fields than expected in record " << row; ++row; - ++record; + ++recordIt; } EXPECT_EQ(row, expectedFields.size()) << "Parsing returned fewer records than expected"; diff --git a/test/fixtures/txtdata/empty.tsv b/test/fixtures/txtdata/empty.tsv new file mode 100644 index 000000000..e69de29bb diff --git a/test/fixtures/txtdata/empty_with_utf8_bom.tsv b/test/fixtures/txtdata/empty_with_utf8_bom.tsv new file mode 100644 index 000000000..5f282702b --- /dev/null +++ b/test/fixtures/txtdata/empty_with_utf8_bom.tsv @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/test/fixtures/txtdata/lf_no_trail.tsv b/test/fixtures/txtdata/lf_no_trail.tsv new file mode 100644 index 000000000..303e16a0c --- /dev/null +++ b/test/fixtures/txtdata/lf_no_trail.tsv @@ -0,0 +1,4 @@ +Test Empty Values + 2 3 +1 2 +1 3 \ No newline at end of file diff --git a/test/fixtures/txtdata/sample.tsv 
b/test/fixtures/txtdata/sample.tsv new file mode 100644 index 000000000..d2f308ef7 --- /dev/null +++ b/test/fixtures/txtdata/sample.tsv @@ -0,0 +1,2 @@ +String Byte Int Float +Sample 145 70322 6.34