Browse Source

Refactor iterator parsing to allow more reuse

pull/6529/head
ephphatha 3 years ago committed by Anders Jenbo
parent
commit
5f6e5ae9fb
  1. 60
      Packaging/resources/assets/txtdata/Readme.md
  2. 101
      Source/data/file.cpp
  3. 98
      Source/data/file.hpp
  4. 473
      Source/data/iterators.hpp
  5. 41
      Source/data/parser.cpp
  6. 66
      Source/data/parser.hpp
  7. 176
      Source/playerdat.cpp
  8. 4
      test/Fixtures.cmake
  9. 291
      test/data_file_test.cpp
  10. 0
      test/fixtures/txtdata/empty.tsv
  11. 1
      test/fixtures/txtdata/empty_with_utf8_bom.tsv
  12. 4
      test/fixtures/txtdata/lf_no_trail.tsv
  13. 2
      test/fixtures/txtdata/sample.tsv

60
Packaging/resources/assets/txtdata/Readme.md

@ -21,28 +21,49 @@ A formal description of the format using
The first record in a file SHOULD be used as a header
Implementations are free to treat records however they want, using
multiple records as headers or none at all.
Files SHOULD contain at least one record.
An empty file (zero-length or containing only a UTF8 BOM sequence) is
valid and is typically interpreted as a header for a single unnamed column
with no records.
All files contain at least one record. The last record in a file SHOULD end
with a trailing newline, however some spreadsheet applications do not
output trailing newlines so we allow flexibility here. This means that an
empty file (zero-length or containing only a UTF8 BOM sequence) is valid
and is typically interpreted as a header for a single unnamed column with
no records.
Records SHOULD contain the same number of fields.
Files SHOULD end with a trailing newline.
Note that while a trailing newline is treated as if the file has a final
record containing a single empty field, in practice these records SHOULD
be discarded. This is mainly to handle the typical output from spreadsheet
applications like Excel.
*/
DXTxtFile ::= utf8bom? header ( recordsep, record )*
DXTxtFile ::= utf8bom? ( singleRecord | record+ finalRecord? )
utf8bom ::= #xEF #xBB #xBF
utf8bom ::= #xEF #xBB #xBF
header ::= record
/*
if parsing reaches EOF and the file has no record terminators then the file
MUST be treated as containing a single record with no terminator
*/
singleRecord ::= fields
/*
if for some reason you want to end a file with a record containing a single
empty field then the record MUST end with a valid terminator
*/
record ::= fields recordterm
/*
this means that the terminator is only truly optional for files where the
final record contains a single field with at least one character, or at least
two fields (as there will then be at least one field separator even if both
fields are zero-length)
*/
finalRecord ::= nonEmptyField | field fieldsep fields
/* an empty line is treated as a single empty field */
record ::= field ( fieldsep field )*
fields ::= field ( fieldsep field )*
/* fields MAY be zero length */
field ::= character*
field ::= character*
/*
unless the final record only consists of a single field, in which case it
MUST contain at least one character
*/
nonEmptyField ::= character+
/*
Any Char (see https://www.w3.org/TR/xml/#NT-Char) except Tab ("\t", 0x09),
@ -50,13 +71,16 @@ field ::= character*
field value. For maximum portability characters in the discouraged ranges
described in the linked section SHOULD NOT be used
*/
character ::= [^#x9#xA#xD]
character ::= [^#x9#xA#xD]
/* fields MUST be separated by tabs */
fieldsep ::= #x9
fieldsep ::= #x9
/* records MUST be separated by line feeds, a cr/lf pair MAY be used */
recordsep ::= #xD? #xA
/*
records (other than the final record or the only record in a single-record
file) MUST be terminated by line feeds, a cr/lf pair MAY be used
*/
recordterm ::= #xD? #xA
```
## File Descriptions

101
Source/data/file.cpp

@ -1,6 +1,9 @@
#include "file.hpp"
#include <fmt/format.h>
#include "engine/assets.hpp"
#include "utils/language.h"
namespace devilution {
tl::expected<DataFile, DataFile::Error> DataFile::load(std::string_view path)
@ -13,9 +16,103 @@ tl::expected<DataFile, DataFile::Error> DataFile::load(std::string_view path)
std::unique_ptr<char[]> data { new char[size] };
{
AssetHandle handle = OpenAsset(std::move(ref));
if (!handle.ok() || !handle.read(data.get(), size))
return tl::unexpected { Error::ReadError };
if (!handle.ok())
return tl::unexpected { Error::OpenFailed };
if (size > 0 && !handle.read(data.get(), size))
return tl::unexpected { Error::BadRead };
}
return DataFile { std::move(data), size };
}
void DataFile::reportFatalError(Error code, std::string_view fileName)
{
switch (code) {
case Error::NotFound:
case Error::OpenFailed:
case Error::BadRead:
app_fatal(fmt::format(fmt::runtime(_(
/* TRANSLATORS: Error message when a data file is missing or corrupt. Arguments are {file name} */
"Unable to load data from file {0}")),
fileName));
case Error::NoContent:
app_fatal(fmt::format(fmt::runtime(_(
/* TRANSLATORS: Error message when a data file is empty or only contains the header row. Arguments are {file name} */
"{0} is incomplete, please check the file contents.")),
fileName));
case Error::NotEnoughColumns:
app_fatal(fmt::format(fmt::runtime(_(
/* TRANSLATORS: Error message when a data file doesn't contain the expected columns. Arguments are {file name} */
"Your {0} file doesn't have the expected columns, please make sure it matches the documented format.")),
fileName));
}
}
void DataFile::reportFatalFieldError(std::errc code, std::string_view fileName, std::string_view fieldName, const DataFileField &field)
{
switch (code) {
case std::errc::invalid_argument:
app_fatal(fmt::format(fmt::runtime(_(
/* TRANSLATORS: Error message when parsing a data file and a text value is encountered when a number is expected. Arguments are {found value}, {column heading}, {file name}, {row/record number}, {column/field number} */
"Non-numeric value {0} for {1} in {2} at row {3} and column {4}")),
field.currentValue(), fieldName, fileName, field.row(), field.column()));
case std::errc::result_out_of_range:
app_fatal(fmt::format(fmt::runtime(_(
/* TRANSLATORS: Error message when parsing a data file and we find a number larger than expected. Arguments are {found value}, {column heading}, {file name}, {row/record number}, {column/field number} */
"Out of range value {0} for {1} in {2} at row {3} and column {4}")),
field.currentValue(), fieldName, fileName, field.row(), field.column()));
default:
app_fatal(fmt::format(fmt::runtime(_(
/* TRANSLATORS: Fallback error message when an error occurs while parsing a data file and we can't determine the cause. Arguments are {file name}, {row/record number}, {column/field number} */
"Unexpected error while reading {0} at row {1} and column {2}")),
fileName, field.row(), field.column()));
}
}
tl::expected<void, DataFile::Error> DataFile::parseHeader(ColumnDefinition *begin, ColumnDefinition *end, tl::function_ref<tl::expected<uint8_t, ColumnDefinition::Error>(std::string_view)> mapper)
{
std::bitset<std::numeric_limits<uint8_t>::max()> seenColumns;
unsigned lastColumn = 0;
RecordIterator firstRecord { data(), data() + size(), false };
for (DataFileField field : *firstRecord) {
if (begin == end) {
// All key columns have been identified
break;
}
auto mapResult = mapper(*field);
if (!mapResult.has_value()) {
// not a key column
continue;
}
uint8_t columnType = mapResult.value();
if (seenColumns.test(columnType)) {
// Repeated column? unusual, maybe this should be an error
continue;
}
seenColumns.set(columnType);
unsigned skipColumns = 0;
if (field.column() > lastColumn)
skipColumns = field.column() - lastColumn - 1;
lastColumn = field.column();
*begin = { columnType, skipColumns };
++begin;
}
// Incrementing the iterator causes it to read to the end of the record in case we broke early (maybe there were extra columns)
++firstRecord;
if (firstRecord == this->end()) {
return tl::unexpected { Error::NoContent };
}
body_ = firstRecord.data();
if (begin != end) {
return tl::unexpected { Error::NotEnoughColumns };
}
return {};
}
} // namespace devilution

98
Source/data/file.hpp

@ -4,17 +4,59 @@
#include <string_view>
#include <expected.hpp>
#include <function_ref.hpp>
#include "iterators.hpp"
namespace devilution {
struct ColumnDefinition {
uint8_t type;
enum class Error {
UnknownColumn
};
// The number of fields between this column and the last one identified as important (or from start of the record if this is the first column we care about)
unsigned skipLength = 0;
ColumnDefinition()
: type(std::numeric_limits<uint8_t>::max())
{
}
ColumnDefinition(unsigned type)
: type(type)
{
}
ColumnDefinition(unsigned type, unsigned skipLength)
: type(type)
, skipLength(skipLength)
{
}
bool operator==(const ColumnDefinition &other) const
{
return type == other.type && skipLength == other.skipLength;
}
template <typename T>
explicit operator T() const
{
return static_cast<T>(type);
}
};
/**
* @brief Container for a tab-delimited file following the TSV-like format described in txtdata/Readme.md
*/
class DataFile {
std::unique_ptr<char[]> data_;
std::unique_ptr<const char[]> data_;
std::string_view content_;
const char *body_;
DataFile() = delete;
/**
@ -22,19 +64,24 @@ class DataFile {
* @param data pointer to the raw data backing the view (this container will take ownership to ensure the lifetime of the view)
* @param size total number of bytes/code units including the BOM if present
*/
DataFile(std::unique_ptr<char[]> &&data, size_t size)
DataFile(std::unique_ptr<const char[]> &&data, size_t size)
: data_(std::move(data))
, content_(data_.get(), size)
{
constexpr std::string_view utf8BOM = "\xef\xbb\xbf";
if (this->content_.starts_with(utf8BOM))
this->content_.remove_prefix(utf8BOM.size());
body_ = this->content_.data();
}
public:
enum class Error {
NotFound,
ReadError
OpenFailed,
BadRead,
NoContent,
NotEnoughColumns
};
/**
@ -46,19 +93,52 @@ public:
*/
static tl::expected<DataFile, Error> load(std::string_view path);
[[nodiscard]] RecordsRange records() const
static void reportFatalError(Error code, std::string_view fileName);
static void reportFatalFieldError(std::errc code, std::string_view fileName, std::string_view fieldName, const DataFileField &field);
void resetHeader()
{
return content_;
body_ = content_.data();
}
[[nodiscard]] const char *begin() const
/**
* @brief Attempts to parse the first row/record in the file, populating the range defined by [begin, end) using the provided mapping function
*
* This method will also set an internal marker so that future uses of the begin iterator skip the header line.
* @param begin Start of the destination range
* @param end End of the destination range
* @param mapper Function that maps from a string_view to a unique numeric identifier for the column
* @return If the file ends after the header or not enough columns were defined this function returns an error code describing the failure.
*/
[[nodiscard]] tl::expected<void, DataFile::Error> parseHeader(ColumnDefinition *begin, ColumnDefinition *end, tl::function_ref<tl::expected<uint8_t, ColumnDefinition::Error>(std::string_view)> mapper);
/**
* @brief Templated version of parseHeader(uint8_t) to allow using directly with enum definitions of columns
* @tparam T An enum or any type that defines operator uint8_t()
* @param begin Start of the destination range
* @param end End of the destination range
* @param typedMapper Function that maps from a string_view to a unique T value
* @return A void success result or an error code as described above
*/
template <typename T>
[[nodiscard]] tl::expected<void, DataFile::Error> parseHeader(ColumnDefinition *begin, ColumnDefinition *end, std::function<tl::expected<T, ColumnDefinition::Error>(std::string_view)> typedMapper)
{
return content_.data();
return parseHeader(begin, end, [typedMapper](std::string_view label) { return typedMapper(label).transform([](T value) { return static_cast<uint8_t>(value); }); });
}
[[nodiscard]] RecordIterator begin() const
{
return { body_, data() + size(), body_ != data() };
}
[[nodiscard]] const char *end() const
[[nodiscard]] RecordIterator end() const
{
return content_.data() + content_.size();
return {};
}
[[nodiscard]] const char *data() const
{
return content_.data();
}
[[nodiscard]] size_t size() const

473
Source/data/iterators.hpp

@ -1,234 +1,365 @@
#pragma once
#include <charconv>
#include <ostream>
#include <expected.hpp>
#include "parser.hpp"
namespace devilution {
class FieldsInRecordRange {
class DataFileField {
GetFieldResult *state_;
const char *const end_;
const char *end_;
unsigned row_;
unsigned column_;
public:
FieldsInRecordRange(GetFieldResult *state, const char *end)
DataFileField(GetFieldResult *state, const char *end, unsigned row, unsigned column)
: state_(state)
, end_(end)
, row_(row)
, column_(column)
{
}
class Iterator {
GetFieldResult *state_;
const char *const end_;
/**
* @brief Returns a view of the current field
*
* This method scans the current field if this is the first value access since the last
* advance. If you expect the field to contain a numeric value then calling parseInt first
* is more efficient, but calling the methods in either order is supported.
* @return The current field value (may be an empty string) or a zero length string_view
*/
[[nodiscard]] std::string_view value()
{
if (state_->status == GetFieldResult::Status::ReadyToRead) {
*state_ = GetNextField(state_->next, end_);
}
return state_->value;
}
public:
using iterator_category = std::input_iterator_tag;
using value_type = std::string_view;
/**
* Convenience function to let DataFileField instances be used like other single-value STL containers
*/
[[nodiscard]] std::string_view operator*()
{
return this->value();
}
Iterator()
: state_(nullptr)
, end_(nullptr)
{
/**
* @brief Attempts to parse the current field as a numeric value using std::from_chars
*
* You can freely interleave this method with calls to operator*. If this is the first value
* access since the last advance this will scan the current field and store it for later
* use with operator* or repeated calls to parseInt (even with different types).
* @tparam T an Integral type supported by std::from_chars
* @param destination value to store the result of successful parsing
* @return the error code from std::from_chars
*/
template <typename T>
[[nodiscard]] std::errc parseInt(T &destination)
{
std::from_chars_result result {};
if (state_->status == GetFieldResult::Status::ReadyToRead) {
const char *begin = state_->next;
result = std::from_chars(begin, end_, destination);
if (result.ec != std::errc::invalid_argument) {
// from_chars was able to consume at least one character, consume the rest of the field
*state_ = GetNextField(result.ptr, end_);
// and prepend what was already parsed
state_->value = { begin, (state_->value.data() - begin) + state_->value.size() };
}
} else {
result = std::from_chars(state_->value.data(), end_, destination);
}
return result.ec;
}
Iterator(GetFieldResult *state, const char *end)
: state_(state)
, end_(end)
{
state_->status = GetFieldResult::Status::ReadyToRead;
template <typename T>
[[nodiscard]] tl::expected<T, std::errc> asInt()
{
T value = 0;
auto parseIntResult = parseInt(value);
if (parseIntResult == std::errc()) {
return value;
}
return tl::unexpected { parseIntResult };
}
[[nodiscard]] bool operator==(const Iterator &rhs) const
{
if (state_ == nullptr && rhs.state_ == nullptr)
return true;
/**
* Returns the current row number
*/
[[nodiscard]] unsigned row() const
{
return row_;
}
return state_ != nullptr && rhs.state_ != nullptr && state_->next == rhs.state_->next;
}
/**
* Returns the current column/field number (from the start of the row/record)
*/
[[nodiscard]] unsigned column() const
{
return column_;
}
[[nodiscard]] bool operator!=(const Iterator &rhs) const
{
return !(*this == rhs);
}
/**
* Allows accessing the value of this field in a const context
*
* This requires an actual non-const value access to happen first before it returns
* any useful results, intended for use in error reporting (or test output).
*/
[[nodiscard]] std::string_view currentValue() const
{
return state_->value;
}
};
/**
* Advances to the next field in the current record
*/
Iterator &operator++()
{
return *this += 1;
}
/**
* @brief Show the field value along with the row/column number (mainly used in test failure messages)
* @param stream output stream, expected to have overloads for unsigned, std::string_view, and char*
* @param field Object to display
* @return the stream, to allow chaining
*/
inline std::ostream &operator<<(std::ostream &stream, const DataFileField &field)
{
return stream << "\"" << field.currentValue() << "\" (at row " << field.row() << ", column " << field.column() << ")";
}
class FieldIterator {
GetFieldResult *state_;
const char *const end_;
const unsigned row_;
unsigned column_ = 0;
/**
* @brief Advances by the specified number of fields
*
* if a non-zero increment is provided and advancing the iterator causes it to reach the end
* of the record the iterator is invalidated. It will compare equal to an end iterator and
* cannot be used for value access or any further parsing
* @param increment how many fields to advance (can be 0)
* @return self-reference
*/
Iterator &operator+=(unsigned increment)
{
if (state_->status == GetFieldResult::Status::ReadyToRead)
*state_ = DiscardMultipleFields(state_->next, end_, increment);
if (state_->endOfRecord()) {
state_ = nullptr;
} else {
// We use Status::ReadyToRead as a marker so we only read the next value on the next call to operator*
// this allows consumers to read from the input stream without using operator* if they want
state_->status = GetFieldResult::Status::ReadyToRead;
}
return *this;
}
public:
using iterator_category = std::input_iterator_tag;
using value_type = DataFileField;
/**
* @brief Returns a view of the current field
*
* This method scans the current field if this is the first value access since the last
* advance. You can repeatedly call operator* safely but you must not try to use parseInt
* after calling this method. If you calling parseInt and get an invalid_argument result
* back you can use this method to get the actual field value, but if you received any
* other result then parseInt has consumed the field and this method is no longer reliable.
* @return The current field value (may be an empty string) or a zero length string_view
*/
[[nodiscard]] value_type operator*()
{
if (state_->status == GetFieldResult::Status::ReadyToRead) {
*state_ = GetNextField(state_->next, end_);
}
return state_->value;
}
FieldIterator()
: state_(nullptr)
, end_(nullptr)
, row_(0)
{
}
/**
* @brief Attempts to parse a field as a numeric value using std::from_chars
*
* This method is NOT repeatable. In addition to the behaviour of std::from_chars please
* heed the following warnings. If the field contains a value larger than will fit into the
* given type parsing will fail with std::errc::out_of_range and the field will be
* discarded. If a field starts with some digits then is followed by other characters the
* remainder will also be discarded. You can only get a useful value out of operator* if
* the result code is std::errc::invalid_argument.
* @tparam T an Integral type supported by std::from_chars
* @param destination value to store the result of successful parsing
* @return the error code from std::from_chars
*/
template <typename T>
[[nodiscard]] std::errc parseInt(T &destination)
{
auto result = std::from_chars(state_->next, end_, destination);
if (result.ec != std::errc::invalid_argument) {
// from_chars was able to consume at least one character, so discard the rest of the field
*state_ = DiscardField(result.ptr, end_);
}
return result.ec;
FieldIterator(GetFieldResult *state, const char *end, unsigned row)
: state_(state)
, end_(end)
, row_(row)
{
state_->status = GetFieldResult::Status::ReadyToRead;
}
[[nodiscard]] bool operator==(const FieldIterator &rhs) const
{
if (state_ == nullptr && rhs.state_ == nullptr)
return true;
return state_ != nullptr && rhs.state_ != nullptr && state_->next == rhs.state_->next;
}
[[nodiscard]] bool operator!=(const FieldIterator &rhs) const
{
return !(*this == rhs);
}
/**
* Advances to the next field in the current record
*/
FieldIterator &operator++()
{
return *this += 1;
}
/**
* @brief Advances by the specified number of fields
*
* if a non-zero increment is provided and advancing the iterator causes it to reach the end
* of the record the iterator is invalidated. It will compare equal to an end iterator and
* cannot be used for value access or any further parsing
* @param increment how many fields to advance (can be 0)
* @return self-reference
*/
FieldIterator &operator+=(unsigned increment)
{
if (increment == 0)
return *this;
if (state_->status == GetFieldResult::Status::ReadyToRead) {
// We never read the value and no longer need it, discard it so that we end up
// advancing past the field delimiter (as if a value access had happened)
*state_ = DiscardField(state_->next, end_);
}
/**
* @brief Checks if this field is the last field in a file, not just in the record
*
* If you call this method before calling parseInt or operator* then this will trigger a
* read, meaning you can no longer use parseInt to try extract a numeric value. You
* probably want to use this after calling parseInt.
* @return true if this field is the last field in the file/RecordsRange
*/
[[nodiscard]] bool endOfFile() const
{
if (state_->status == GetFieldResult::Status::ReadyToRead) {
*state_ = GetNextField(state_->next, end_);
}
return state_->endOfFile();
if (state_->endOfRecord()) {
state_ = nullptr;
} else {
unsigned fieldsSkipped = 0;
// By this point we've already advanced past the end of this field (either because the
// last value access found the end of the field by necessity or we discarded it a few
// lines up), so we only need to advance further if an increment greater than 1 was
// provided.
*state_ = DiscardMultipleFields(state_->next, end_, increment - 1, &fieldsSkipped);
// As we've consumed the current field by this point we need to increment the internal
// column counter one extra time so we have an accurate value.
column_ += fieldsSkipped + 1;
// We use Status::ReadyToRead as a marker so we only read the next value on the next
// value access, this allows consumers to choose the most efficient method (e.g. if
// they want the value as an int) or even repeated advances without using a value.
state_->status = GetFieldResult::Status::ReadyToRead;
}
};
return *this;
}
[[nodiscard]] Iterator begin() const
/**
* @brief Returns a view of the current field
*
* The returned value is a thin wrapper over the current state of this iterator (or last
* successful read if incrementing this iterator would result in it reaching the end state).
*/
[[nodiscard]] value_type operator*()
{
return { state_, end_ };
return { state_, end_, row_, column_ };
}
[[nodiscard]] Iterator end() const
/**
* @brief Returns the current row number
*/
[[nodiscard]] unsigned row() const
{
return {};
return row_;
}
/**
* @brief Returns the current column/field number (from the start of the row/record)
*/
[[nodiscard]] unsigned column() const
{
return column_;
}
};
class RecordsRange {
const char *const begin_;
class DataFileRecord {
GetFieldResult *state_;
const char *const end_;
const unsigned row_;
public:
RecordsRange(std::string_view characters)
: begin_(characters.data())
, end_(characters.data() + characters.size())
DataFileRecord(GetFieldResult *state, const char *end, unsigned row)
: state_(state)
, end_(end)
, row_(row)
{
}
class Iterator {
GetFieldResult state_;
const char *const end_;
[[nodiscard]] FieldIterator begin()
{
return { state_, end_, row_ };
}
public:
using iterator_category = std::forward_iterator_tag;
using value_type = FieldsInRecordRange;
[[nodiscard]] FieldIterator end() const
{
return {};
}
Iterator()
: state_(nullptr, GetFieldResult::Status::EndOfFile)
, end_(nullptr)
{
}
[[nodiscard]] unsigned row() const
{
return row_;
}
};
Iterator(const char *begin, const char *end)
: state_(begin)
, end_(end)
{
}
class RecordIterator {
GetFieldResult state_;
const char *const end_;
unsigned row_ = 0;
[[nodiscard]] bool operator==(const Iterator &rhs) const
{
return state_.next == rhs.state_.next;
}
public:
using iterator_category = std::forward_iterator_tag;
using value_type = DataFileRecord;
[[nodiscard]] bool operator!=(const Iterator &rhs) const
{
return !(*this == rhs);
}
RecordIterator()
: state_(nullptr, GetFieldResult::Status::EndOfFile)
, end_(nullptr)
{
}
Iterator &operator++()
{
return *this += 1;
}
RecordIterator(const char *begin, const char *end, bool skippedHeader)
: state_(begin)
, end_(end)
, row_(skippedHeader ? 1 : 0)
{
}
Iterator &operator+=(unsigned increment)
{
if (!state_.endOfRecord())
state_ = DiscardRemainingFields(state_.next, end_);
if (state_.endOfFile()) {
state_.next = nullptr;
} else {
if (increment > 0)
state_ = DiscardMultipleRecords(state_.next, end_, increment - 1);
// We use Status::ReadyToRead as a marker in case the Field iterator is never
// used, so the next call to operator+= will advance past the current record
state_.status = GetFieldResult::Status::ReadyToRead;
}
[[nodiscard]] bool operator==(const RecordIterator &rhs) const
{
return state_.next == rhs.state_.next;
}
[[nodiscard]] bool operator!=(const RecordIterator &rhs) const
{
return !(*this == rhs);
}
RecordIterator &operator++()
{
return *this += 1;
}
RecordIterator &operator+=(unsigned increment)
{
if (increment == 0)
return *this;
if (!state_.endOfRecord()) {
// The field iterator either hasn't been used or hasn't consumed the entire record
state_ = DiscardRemainingFields(state_.next, end_);
}
[[nodiscard]] value_type operator*()
{
return { &state_, end_ };
if (state_.endOfFile()) {
state_.next = nullptr;
} else {
unsigned recordsSkipped = 0;
// By this point we've already advanced past the end of this record (either because the
// last value access found the end of the record by necessity or we discarded any
// leftovers a few lines up), so we only need to advance further if an increment
// greater than 1 was provided.
state_ = DiscardMultipleRecords(state_.next, end_, increment - 1, &recordsSkipped);
// As we've consumed the current record by this point we need to increment the internal
// row counter one extra time so we have an accurate value.
row_ += recordsSkipped + 1;
// We use Status::ReadyToRead as a marker in case the DataFileField iterator is never
// used, so the next call to operator+= will advance past the current record
state_.status = GetFieldResult::Status::ReadyToRead;
}
};
return *this;
}
[[nodiscard]] Iterator begin() const
[[nodiscard]] DataFileRecord operator*()
{
return { begin_, end_ };
return { &state_, end_, row_ };
}
[[nodiscard]] Iterator end() const
/**
* @brief Exposes the current location of this input iterator.
*
* This is only expected to be used internally so the DataFile instance knows where the header
* ends and the body begins. You probably don't want to use this directly.
*/
[[nodiscard]] const char *data() const
{
return {};
return state_.next;
}
/**
* @brief Returns the current row/record number (from the start of the file)
*
* The header row is always considered row 0, however if you've called DataFile.parseHeader()
* before calling DataFile.begin() then you'll get row 1 as the first record of the range.
*/
[[nodiscard]] unsigned row() const
{
return row_;
}
};
} // namespace devilution

41
Source/data/parser.cpp

@ -1,10 +1,10 @@
#include "parser.hpp"
namespace devilution {
GetFieldResult HandleRecordSeparator(const char *begin, const char *end)
GetFieldResult HandleRecordTerminator(const char *begin, const char *end)
{
if (begin == end) {
return { end, GetFieldResult::Status::EndOfFile };
return { end, GetFieldResult::Status::NoFinalTerminator };
}
if (*begin == '\r') {
@ -15,36 +15,49 @@ GetFieldResult HandleRecordSeparator(const char *begin, const char *end)
// carriage returns should be followed by a newline, so let's let the following checks handle it
}
if (*begin == '\n') {
return { begin + 1, GetFieldResult::Status::EndOfRecord };
++begin;
if (begin == end) {
return { end, GetFieldResult::Status::EndOfFile };
}
return { begin, GetFieldResult::Status::EndOfRecord };
}
return { begin, GetFieldResult::Status::BadRecordSeparator };
return { begin, GetFieldResult::Status::BadRecordTerminator };
}
GetFieldResult DiscardMultipleFields(const char *begin, const char *end, unsigned skipLength)
GetFieldResult DiscardMultipleFields(const char *begin, const char *end, unsigned skipLength, unsigned *fieldsSkipped)
{
GetFieldResult result { begin };
while (skipLength > 0) {
unsigned skipCount = 0;
while (skipCount < skipLength) {
++skipCount;
result = DiscardField(result.next, end);
if (result.endOfRecord()) {
// Found the end of record early, we can reuse the error code so just return it
return result;
// Found the end of record early
break;
}
--skipLength;
}
if (fieldsSkipped != nullptr) {
*fieldsSkipped = skipCount;
}
return result;
}
GetFieldResult DiscardMultipleRecords(const char *begin, const char *end, unsigned skipLength)
GetFieldResult DiscardMultipleRecords(const char *begin, const char *end, unsigned skipLength, unsigned *recordsSkipped)
{
GetFieldResult result { begin };
while (skipLength > 0) {
unsigned skipCount = 0;
while (skipCount < skipLength) {
++skipCount;
result = DiscardRemainingFields(result.next, end);
if (result.endOfFile()) {
// Found the end of file early, we can reuse the error code so just return it
return result;
// Found the end of file early
break;
}
--skipLength;
}
if (recordsSkipped != nullptr) {
*recordsSkipped = skipCount;
}
return result;
}

66
Source/data/parser.hpp

@ -16,9 +16,10 @@ struct GetFieldResult {
ReadyToRead,
EndOfField,
EndOfRecord,
BadRecordSeparator,
BadRecordTerminator,
EndOfFile,
FileTruncated
FileTruncated,
NoFinalTerminator
} status
= Status::ReadyToRead;
@ -53,7 +54,7 @@ struct GetFieldResult {
*/
[[nodiscard]] bool endOfRecord() const
{
return IsAnyOf(status, Status::EndOfRecord, Status::BadRecordSeparator) || endOfFile();
return IsAnyOf(status, Status::EndOfRecord, Status::BadRecordTerminator) || endOfFile();
}
/**
@ -61,16 +62,16 @@ struct GetFieldResult {
*/
[[nodiscard]] bool endOfFile() const
{
return IsAnyOf(status, Status::EndOfFile, Status::FileTruncated);
return IsAnyOf(status, Status::EndOfFile, Status::FileTruncated, Status::NoFinalTerminator);
}
};
/**
 * @brief Checks if this character is potentially part of a record terminator sequence
 * @param c character to check
 * @return true if it's a record terminator (lf) or carriage return (cr, accepted as the start of a crlf pair)
 */
constexpr bool IsRecordTerminator(char c)
{
	return c == '\r' || c == '\n';
}

/**
 * @brief Checks if this character is a field separator
 *
 * Note that record terminator sequences also act to separate fields
 * @param c character to check
 * @return true if it's a field separator (tab) or part of a record terminator sequence
 */
constexpr bool IsFieldSeparator(char c)
{
	return c == '\t' || IsRecordTerminator(c);
}
/**
 * @brief Consumes the current record terminator sequence and returns a result describing whether at least one more record is available.
 *
 * Assumes that begin points to a record terminator (lf or crlf) or the last read reached the end
 * of the final record (in which case the terminator is optional). If we reached the end of the
 * stream (`begin == end`) then `status` will compare equal to Status::NoFinalTerminator. If we
 * found a carriage return (cr) character just before the end of the stream then it's likely the
 * file was truncated, `status` will contain Status::FileTruncated. If we found a carriage return
 * that is followed by any character other than a newline (lf), or `begin` didn't point to a record
 * terminator, then `status` will be Status::BadRecordTerminator and the file has probably been
 * mangled. Otherwise `status` will be Status::EndOfRecord if at least one more record is available
 * or Status::EndOfFile if this was the end of the last record.
 * @param begin start of a stream (expected to be pointing to a record terminator)
 * @param end one past the last character of the stream
 * @return a struct containing a pointer to the start of the next record (if more characters are
 *     available) or a copy of end, and a status code describing the terminator.
 */
GetFieldResult HandleRecordTerminator(const char *begin, const char *end);
/**
* @brief Consumes the current field (or record) separator and returns a result describing whether at least one more field is available.
@ -113,7 +113,7 @@ GetFieldResult HandleRecordSeparator(const char *begin, const char *end);
* Assumes that begin points to a field or record separator (tab, cr, lf, or EOF). If there are
* more fields in the current record then the return value will have a pointer to the start of the
* next field and `status` will be GetFieldResult::Status::EndOfField. Otherwise refer to
* HandleRecordSeparator for a description of the different codes.
* HandleRecordTerminator for a description of the different codes.
* @param begin start of a stream (expected to be pointing to a record separator)
* @param end one past the last character of the stream
* @return a struct containing a pointer to the start of the next field (if more characters are
@ -126,7 +126,7 @@ inline GetFieldResult HandleFieldSeparator(const char *begin, const char *end)
return { begin + 1, GetFieldResult::Status::EndOfField };
}
return HandleRecordSeparator(begin, end);
return HandleRecordTerminator(begin, end);
}
/**
@ -148,20 +148,24 @@ inline GetFieldResult DiscardField(const char *begin, const char *end)
* @param begin first character of the stream
* @param end one past the last character in the stream
* @param skipLength how many fields to skip (specifying 0 will cause the method to return without advancing)
* @param fieldsSkipped optional output parameter, will be filled with a count of how many fields
* were skipped in case the end of record was reached early
* @return a GetFieldResult struct containing an empty value, a pointer to the start of the next
* field/record, and a status code describing what type of separator was found
*/
GetFieldResult DiscardMultipleFields(const char *begin, const char *end, unsigned skipLength);
GetFieldResult DiscardMultipleFields(const char *begin, const char *end, unsigned skipLength, unsigned *fieldsSkipped = nullptr);
/**
* @brief Advances by the specified number of records or until the end of the file, whichever occurs first
* @param begin first character of the stream
* @param end one past the last character in the stream
* @param skipLength how many records to skip (specifying 0 will cause the method to return without advancing)
* @param recordsSkipped optional output parameter, will be filled with a count of how many records
* were skipped in case the end of file was reached early
* @return a GetFieldResult struct containing an empty value, a pointer to the start of the next
* record, and a status code describing what type of separator was found
*/
GetFieldResult DiscardMultipleRecords(const char *begin, const char *end, unsigned skipLength);
GetFieldResult DiscardMultipleRecords(const char *begin, const char *end, unsigned skipLength, unsigned *recordsSkipped = nullptr);
/**
* @brief Discard any remaining fields in the current record
@ -172,9 +176,9 @@ GetFieldResult DiscardMultipleRecords(const char *begin, const char *end, unsign
*/
inline GetFieldResult DiscardRemainingFields(const char *begin, const char *end)
{
	// Skip everything up to the next record terminator (or end of stream), then consume it.
	const char *nextTerminator = std::find_if(begin, end, IsRecordTerminator);
	return HandleRecordTerminator(nextTerminator, end);
}
/**
@ -185,7 +189,7 @@ inline GetFieldResult DiscardRemainingFields(const char *begin, const char *end)
* `next` member of the returned type will be either the start of the next field/record or `end`.
* The `status` member contains additional information to distinguish between the end of a field
* and the end of a record. If there are additional fields in this record then `status` will be
* GetFieldResult::Status::EndOfField, otherwise refer to HandleRecordSeparator for the meanings
* GetFieldResult::Status::EndOfField, otherwise refer to HandleRecordTerminator for the meanings
* associated with the remaining codes.
* @param begin first character of the stream
* @param end one past the last character in the stream

176
Source/playerdat.cpp

@ -45,7 +45,7 @@ public:
[[nodiscard]] uint32_t getThresholdForLevel(unsigned level) const
{
if (level > 0)
return levelThresholds[std::min<size_t>(level - 1, getMaxLevel())];
return levelThresholds[std::min<unsigned>(level - 1, getMaxLevel())];
return 0;
}
@ -64,167 +64,76 @@ public:
}
} ExperienceData;
// Columns of interest in the Experience data file, in the order expected by the parser.
enum class ExperienceColumn {
	Level,
	Experience,
	LAST = Experience
};

/**
 * @brief Maps a header field name to the corresponding experience table column.
 * @param name header field value as read from the data file
 * @return the matching column, or ColumnDefinition::Error::UnknownColumn for unrecognised names
 */
tl::expected<ExperienceColumn, ColumnDefinition::Error> mapExperienceColumnFromName(std::string_view name)
{
	if (name == "Level") {
		return ExperienceColumn::Level;
	}
	if (name == "Experience") {
		return ExperienceColumn::Experience;
	}
	return tl::unexpected { ColumnDefinition::Error::UnknownColumn };
}
void ReloadExperienceData()
{
constexpr std::string_view filename = "txtdata\\Experience.tsv";
auto dataFileResult = DataFile::load(filename);
if (!dataFileResult.has_value()) {
app_fatal(fmt::format(fmt::runtime(_(/* TRANSLATORS: Error message when a data file is missing or corrupt */ "Unable to load player experience data from file {:s}")), filename));
DataFile::reportFatalError(dataFileResult.error(), filename);
}
const DataFile &dataFile = dataFileResult.value();
constexpr unsigned ExpectedColumnCount = enum_size<ExperienceColumnDefinition::ColumnType>::value;
StaticVector<ExperienceColumnDefinition, ExpectedColumnCount> columns;
std::bitset<ExpectedColumnCount> seenColumns;
unsigned currentColumn = 0;
unsigned lastColumn = 0;
RecordsRange records = dataFile.records();
auto record = records.begin();
auto endRecord = records.end();
for (std::string_view field : *record) {
if (columns.size() >= ExpectedColumnCount) {
// All key columns have been identified
break;
}
DataFile &dataFile = dataFileResult.value();
auto columnType = ExperienceColumnDefinition::mapNameToType(field);
if (columnType.has_value() && !seenColumns.test(static_cast<size_t>(columnType.value()))) {
seenColumns.set(static_cast<size_t>(columnType.value()));
unsigned skipColumns = 0;
if (currentColumn > lastColumn)
skipColumns = currentColumn - lastColumn - 1;
columns.emplace_back(columnType.value(), skipColumns);
lastColumn = currentColumn;
}
++currentColumn;
}
++record;
if (record == endRecord) {
// The data file ended after the header, since there's no data we can't proceed
app_fatal(fmt::format(fmt::runtime(_(
/* TRANSLATORS: Error message when a data file is empty or only contains the header row */
"{:s} is incomplete, please check the file contents.")),
filename));
}
constexpr unsigned ExpectedColumnCount = enum_size<ExperienceColumn>::value;
std::array<ColumnDefinition, ExpectedColumnCount> columns;
auto parseHeaderResult = dataFile.parseHeader<ExperienceColumn>(columns.data(), columns.data() + columns.size(), mapExperienceColumnFromName);
if (columns.size() < ExpectedColumnCount) {
// The data file doesn't have the required headers. Though we could potentially just allocate
// missing columns in the default order that's likely to lead to further corruption in saves
app_fatal(fmt::format(fmt::runtime(_(
/* TRANSLATORS: Error message when a data file doesn't contain the expected columns */
"Your {:s} file doesn't have the expected columns, please make sure it matches the documented format.")),
filename));
if (!parseHeaderResult.has_value()) {
DataFile::reportFatalError(parseHeaderResult.error(), filename);
}
ExperienceData.clear();
unsigned row = 1; // current line/record number for error messages
while (record != endRecord) {
for (DataFileRecord record : dataFile) {
uint8_t level = 0;
uint32_t experience = 0;
bool skipRecord = false;
FieldsInRecordRange fields = *record;
auto field = fields.begin();
auto endField = fields.end();
unsigned col = 0; // current field number for error messages
FieldIterator fieldIt = record.begin();
FieldIterator endField = record.end();
for (auto &column : columns) {
col += column.skipLength;
field += column.skipLength;
fieldIt += column.skipLength;
if (field == endField) {
// reached the end of record early, this could be from a trailing newline so don't throw an error
skipRecord = true;
break;
if (fieldIt == endField) {
DataFile::reportFatalError(DataFile::Error::NotEnoughColumns, filename);
}
switch (column.type) {
case ExperienceColumnDefinition::ColumnType::Level: {
DataFileField field = *fieldIt;
switch (static_cast<ExperienceColumn>(column)) {
case ExperienceColumn::Level: {
auto parseIntResult = field.parseInt(level);
if (parseIntResult == std::errc::invalid_argument) {
// not a signless numeric value, is this a trailing newline or the MaxLevel line?
if (field.endOfFile() || *field == "MaxLevel") {
if (parseIntResult != std::errc()) {
if (*field == "MaxLevel") {
skipRecord = true;
} else {
app_fatal(fmt::format(fmt::runtime(_(
/* TRANSLATORS: Error message when parsing the Experience data file and a text value is encountered in the Level column */
"Expected a positive numeric value for Level in {:s}, found {:s} at row {:d} and column {:d}")),
filename, *field, row, col));
DataFile::reportFatalFieldError(parseIntResult, filename, "Level", field);
}
} else if (parseIntResult == std::errc::result_out_of_range) {
// a level greater than 255 was provided
app_fatal(fmt::format(fmt::runtime(_(
/* TRANSLATORS: Error message when parsing the Experience data file and a text value is encountered in the Level column */
"Levels above {:d} are not supported, out of range value in {:s} at row {:d} and column {:d}")),
std::numeric_limits<uint8_t>::max(), filename, row, col));
}
} break;
case ExperienceColumnDefinition::ColumnType::Experience: {
case ExperienceColumn::Experience: {
auto parseIntResult = field.parseInt(experience);
if (parseIntResult == std::errc::invalid_argument) {
// not a signless numeric value, is this a trailing newline?
if (field.endOfFile()) {
skipRecord = true;
} else {
app_fatal(fmt::format(fmt::runtime(_(
/* TRANSLATORS: Error message when parsing the Experience data file and a text value is encountered in the Experience column */
"Expected a positive numeric value for Experience in {:s}, found {:s} at row {:d} and column {:d}")),
filename, *field, row, col));
}
} else if (parseIntResult == std::errc::result_out_of_range) {
// an experience threshold greater than 2^32-1 was provided
app_fatal(fmt::format(fmt::runtime(_(
/* TRANSLATORS: Error message when parsing the Experience data file and a text value is encountered in the Experience column */
"Experience thresholds above {:d} are not supported, out of range value in {:s} at row {:d} and column {:d}")),
std::numeric_limits<uint32_t>::max(), filename, row, col));
if (parseIntResult != std::errc()) {
DataFile::reportFatalFieldError(parseIntResult, filename, "Experience", field);
}
} break;
@ -235,14 +144,11 @@ void ReloadExperienceData()
if (skipRecord)
break;
++field;
++col;
++fieldIt;
}
if (!skipRecord)
ExperienceData.setThresholdForLevel(level, experience);
++record;
++row;
}
}

4
test/Fixtures.cmake

@ -86,7 +86,11 @@ set(devilutionx_fixtures
timedemo/WarriorLevel1to2/spawn_0.sv
txtdata/cr.tsv
txtdata/crlf.tsv
txtdata/empty.tsv
txtdata/empty_with_utf8_bom.tsv
txtdata/lf.tsv
txtdata/lf_no_trail.tsv
txtdata/sample.tsv
txtdata/utf8_bom.tsv
)

291
test/data_file_test.cpp

@ -22,14 +22,15 @@ void TestFileContents(
GetFieldResult::Status expectedEndOfRecordStatus = GetFieldResult::Status::EndOfRecord,
GetFieldResult::Status expectedEndOfFileStatus = GetFieldResult::Status::EndOfFile)
{
GetFieldResult result { dataFile.begin() };
GetFieldResult result { dataFile.data() };
const char *end = dataFile.data() + dataFile.size();
unsigned row = 0;
do {
ASSERT_LT(row, expectedContent.size()) << "Too many records";
unsigned col = 0;
do {
ASSERT_LT(col, expectedContent[row].size()) << "Too many fields in record " << row;
result = GetNextField(result.next, dataFile.end());
result = GetNextField(result.next, end);
EXPECT_EQ(result.value, expectedContent[row][col]) << "Unexpected value at record " << row << " and field " << col;
col++;
} while (!result.endOfRecord());
@ -65,10 +66,9 @@ TEST(DataFileTest, LoadCRFile)
{ "", "2", "3" },
{ "1", "2", "" },
{ "1", "", "3" }
// because the file does not end with a newline parsing stops at the previous record
};
TestFileContents(dataFile, expectedFields, GetFieldResult::Status::BadRecordSeparator, GetFieldResult::Status::FileTruncated);
TestFileContents(dataFile, expectedFields, GetFieldResult::Status::BadRecordTerminator, GetFieldResult::Status::FileTruncated);
}
TEST(DataFileTest, LoadWindowsFile)
@ -84,7 +84,6 @@ TEST(DataFileTest, LoadWindowsFile)
{ "", "2", "3" },
{ "1", "2", "" },
{ "1", "", "3" },
{ "" } // file ends with a newline, parsing returns a single empty field
};
TestFileContents(dataFile, expectedFields);
@ -103,12 +102,77 @@ TEST(DataFileTest, LoadTypicalFile)
{ "", "2", "3" },
{ "1", "2", "" },
{ "1", "", "3" },
{ "" } // file ends with a newline, parsing returns a single empty field
};
TestFileContents(dataFile, expectedFields);
}
// Files without a trailing newline must still parse: the final record is terminated by the end
// of the stream and the file-level status becomes NoFinalTerminator instead of EndOfFile.
TEST(DataFileTest, LoadFileWithNoTrailingNewline)
{
	auto result = LoadDataFile("txtdata\\lf_no_trail.tsv");
	ASSERT_TRUE(result.has_value()) << "Unable to load lf_no_trail.tsv";

	DataFile &dataFile = result.value();
	// 32 bytes: three LF-terminated records plus a final record with no terminator
	EXPECT_EQ(dataFile.size(), 32) << "File size should be reported in code units";

	std::vector<std::vector<std::string_view>> expectedFields {
		{ "Test", "Empty", "Values" },
		{ "", "2", "3" },
		{ "1", "2", "" },
		{ "1", "", "3" },
	};

	TestFileContents(dataFile, expectedFields, GetFieldResult::Status::EndOfRecord, GetFieldResult::Status::NoFinalTerminator);
}
std::string_view mapError(DataFile::Error error)
{
switch (error) {
case DataFile::Error::NotFound:
return "not found";
case DataFile::Error::OpenFailed:
return "cannot open";
case DataFile::Error::BadRead:
return "cannot read contents";
default:
return "unexpected error";
}
}
// A zero-length file is valid input: it parses as a single record holding one empty field, with
// NoFinalTerminator reported for both the record and the file.
TEST(DataFileTest, LoadEmptyFile)
{
	auto result = LoadDataFile("txtdata\\empty.tsv");
	if (!result.has_value()) {
		FAIL() << "Unable to load empty.tsv, error: " << mapError(result.error());
	}

	DataFile &dataFile = result.value();
	EXPECT_EQ(dataFile.size(), 0) << "File size should be reported in code units";

	std::vector<std::vector<std::string_view>> expectedFields {
		{ "" },
	};

	TestFileContents(dataFile, expectedFields, GetFieldResult::Status::NoFinalTerminator, GetFieldResult::Status::NoFinalTerminator);
}
// A file containing only a UTF8 byte order marker behaves like an empty file: the BOM is stripped
// on load, leaving a single record with one empty field.
TEST(DataFileTest, LoadEmptyFileWithBOM)
{
	auto result = LoadDataFile("txtdata\\empty_with_utf8_bom.tsv");
	if (!result.has_value()) {
		FAIL() << "Unable to load empty_with_utf8_bom.tsv, error: " << mapError(result.error());
	}

	DataFile &dataFile = result.value();
	EXPECT_EQ(dataFile.size(), 0) << "Loading a file containing a UTF8 byte order marker should strip that prefix";

	std::vector<std::vector<std::string_view>> expectedFields {
		{ "" },
	};

	TestFileContents(dataFile, expectedFields, GetFieldResult::Status::NoFinalTerminator, GetFieldResult::Status::NoFinalTerminator);
}
TEST(DataFileTest, LoadUtf8WithBOM)
{
auto result = LoadDataFile("txtdata\\utf8_bom.tsv");
@ -122,12 +186,74 @@ TEST(DataFileTest, LoadUtf8WithBOM)
{ "", "2", "3" },
{ "1", "2", "" },
{ "1", "", "3" },
{ "" } // file ends with a newline, parsing returns a single empty field
};
TestFileContents(dataFile, expectedFields);
}
// Exercises DataFileField::parseInt against the four columns of sample.tsv: a plain string, a
// value that fits uint8_t, a value needing a wider type, and a float-like value parsed prefix-wise.
// Also fixes the failure messages: "Expected " << field << "to fit..." was missing the space
// before "to", producing output like "Expected 145to fit".
TEST(DataFileTest, ParseInt)
{
	auto result = LoadDataFile("txtdata\\sample.tsv");
	if (!result.has_value()) {
		FAIL() << "Unable to load sample.tsv, error: " << mapError(result.error());
	}

	DataFile &dataFile = result.value();
	auto unused = dataFile.parseHeader(nullptr, nullptr, [](std::string_view) -> tl::expected<uint8_t, ColumnDefinition::Error> { return tl::unexpected { ColumnDefinition::Error::UnknownColumn }; });
	EXPECT_TRUE(unused.has_value()) << "Should be able to parse and discard the header from the sample.tsv file";

	for (DataFileRecord record : dataFile) {
		auto fieldIt = record.begin();
		auto end = record.end();
		ASSERT_NE(fieldIt, end) << "sample.tsv must contain at least one field to use as a test value for strings";

		// First field is a string that doesn't start with a digit or - character
		DataFileField field = *fieldIt;
		uint8_t shortVal = 5;
		auto parseIntResult = field.parseInt(shortVal);
		EXPECT_EQ(parseIntResult, std::errc::invalid_argument) << "Strings are not uint8_t values";
		EXPECT_EQ(shortVal, 5) << "Value is not modified when parsing as uint8_t fails due to non-numeric fields";
		EXPECT_EQ(*field, "Sample") << "Should be able to access the field value as a string after failure";

		++fieldIt;
		ASSERT_NE(fieldIt, end) << "sample.tsv must contain a second field to use as a test value for small ints";

		// Second field is a number that fits into an uint8_t value
		field = *fieldIt;
		shortVal = 5;
		parseIntResult = field.parseInt(shortVal);
		EXPECT_EQ(parseIntResult, std::errc()) << "Expected " << field << " to fit into a uint8_t variable";
		EXPECT_EQ(shortVal, 145) << "Parsing should give the expected base 10 value";
		EXPECT_EQ(*field, "145") << "Should be able to access the field value as a string even after parsing as an int";

		++fieldIt;
		ASSERT_NE(fieldIt, end) << "sample.tsv must contain a third field to use as a test value for large ints";

		// Third field is a number too large for a uint8_t but that fits into an int value
		field = *fieldIt;
		parseIntResult = field.parseInt(shortVal);
		EXPECT_EQ(parseIntResult, std::errc::result_out_of_range) << "A value too large to fit into a uint8_t variable should report an error";
		EXPECT_EQ(shortVal, 145) << "Value is not modified when parsing as uint8_t fails due to out of range value";
		int longVal = 42;
		parseIntResult = field.parseInt(longVal);
		EXPECT_EQ(parseIntResult, std::errc()) << "Expected " << field << " to fit into an int variable";
		EXPECT_EQ(longVal, 70322) << "Value is expected to be parsed into a larger type after an out of range failure";
		EXPECT_EQ(*field, "70322") << "Should be able to access the field value as a string after parsing as an int";

		++fieldIt;
		ASSERT_NE(fieldIt, end) << "sample.tsv must contain a fourth field to use as a test value for fields that look like ints";

		// Fourth field is not an integer, but a value that starts with one or more digits that fit into an uint8_t value
		field = *fieldIt;
		parseIntResult = field.parseInt(shortVal);
		EXPECT_EQ(parseIntResult, std::errc()) << "Expected " << field << " to fit into a uint8_t variable (even though it's not really an int)";
		EXPECT_EQ(shortVal, 6) << "Value is loaded as expected until the first non-digit character";
		EXPECT_EQ(*field, "6.34") << "Should be able to access the field value as a string after failure";
	}
}
TEST(DataFileTest, IterateOverRecords)
{
auto result = LoadDataFile("txtdata\\lf.tsv");
@ -140,16 +266,51 @@ TEST(DataFileTest, IterateOverRecords)
{ "", "2", "3" },
{ "1", "2", "" },
{ "1", "", "3" },
{ "" } // file ends with a newline, parsing returns a single empty field
};
unsigned row = 0;
for (FieldsInRecordRange record : dataFile.records()) {
for (DataFileRecord record : dataFile) {
ASSERT_LT(row, expectedFields.size()) << "Too many records";
unsigned col = 0;
for (DataFileField field : record) {
if (col < expectedFields[row].size())
EXPECT_EQ(*field, expectedFields[row][col]) << "Unexpected value at record " << row << " and field " << col;
else
ADD_FAILURE() << "Extra value '" << field << "' in record " << row << " at field " << col;
col++;
}
EXPECT_GE(col, expectedFields[row].size()) << "Parsing returned fewer fields than expected in record " << row;
row++;
}
EXPECT_EQ(row, expectedFields.size()) << "Parsing returned fewer records than expected";
}
TEST(DataFileTest, ParseHeaderThenIterateOverRecords)
{
auto result = LoadDataFile("txtdata\\lf.tsv");
ASSERT_TRUE(result.has_value()) << "Unable to load lf.tsv";
DataFile &dataFile = result.value();
std::vector<std::vector<std::string_view>> expectedFields {
{ "", "2", "3" },
{ "1", "2", "" },
{ "1", "", "3" },
};
auto parseHeaderResult = dataFile.parseHeader(nullptr, nullptr, [](std::string_view) -> tl::expected<uint8_t, ColumnDefinition::Error> { return tl::unexpected { ColumnDefinition::Error::UnknownColumn }; });
EXPECT_TRUE(parseHeaderResult.has_value()) << "Expected to be able to parse and discard the header record";
unsigned row = 0;
for (DataFileRecord record : dataFile) {
ASSERT_LT(row, expectedFields.size()) << "Too many records";
EXPECT_EQ(row + 1, record.row()) << "DataFileRecord (through iterator) should report a 1-indexed row after parsing the header record";
unsigned col = 0;
for (std::string_view field : record) {
for (DataFileField field : record) {
EXPECT_EQ(record.row(), field.row()) << "Field should report the same row as the current DataFileRecord";
EXPECT_EQ(col, field.column()) << "Field (through iterator) should report a 0-indexed column";
if (col < expectedFields[row].size())
EXPECT_EQ(field, expectedFields[row][col]) << "Unexpected value at record " << row << " and field " << col;
EXPECT_EQ(*field, expectedFields[row][col]) << "Unexpected value at record " << row << " and field " << col;
else
ADD_FAILURE() << "Extra value '" << field << "' in record " << row << " at field " << col;
col++;
@ -167,19 +328,25 @@ TEST(DataFileTest, DiscardAllAfterFirstField)
DataFile &dataFile = loadDataResult.value();
std::array<std::string_view, 5> expectedFields { "Test", "", "1", "1", "" };
std::vector<std::vector<std::string_view>> expectedFields {
{ "Test", "Empty", "Values" },
{ "", "2", "3" },
{ "1", "2", "" },
{ "1", "", "3" },
};
GetFieldResult result { dataFile.begin() };
GetFieldResult result { dataFile.data() };
const char *end = dataFile.data() + dataFile.size();
unsigned row = 0;
do {
ASSERT_LT(row, expectedFields.size()) << "Too many records";
result = GetNextField(result.next, dataFile.end());
EXPECT_EQ(result.value, expectedFields[row]) << "Unexpected first value at record " << row;
result = GetNextField(result.next, end);
EXPECT_EQ(result.value, expectedFields[row][0]) << "Unexpected first value at record " << row;
if (result.endOfRecord() && !result.endOfFile())
ADD_FAILURE() << "Parsing returned fewer fields than expected in record " << row;
else
result = DiscardRemainingFields(result.next, dataFile.end());
result = DiscardRemainingFields(result.next, end);
row++;
} while (!result.endOfFile());
@ -193,13 +360,18 @@ TEST(DataFileTest, DiscardAllAfterFirstFieldIterator)
DataFile &dataFile = result.value();
std::array<std::string_view, 5> expectedFields { "Test", "", "1", "1", "" };
std::vector<std::vector<std::string_view>> expectedFields {
{ "Test", "Empty", "Values" },
{ "", "2", "3" },
{ "1", "2", "" },
{ "1", "", "3" },
};
unsigned row = 0;
for (FieldsInRecordRange record : dataFile.records()) {
for (DataFileRecord record : dataFile) {
ASSERT_LT(row, expectedFields.size()) << "Too many records";
for (std::string_view field : record) {
EXPECT_EQ(field, expectedFields[row]) << "Unexpected first value at record " << row;
for (DataFileField field : record) {
EXPECT_EQ(*field, expectedFields[row][0]) << "Field with value " << field << " does not match the expected value for record " << row;
break;
}
row++;
@ -213,23 +385,29 @@ TEST(DataFileTest, DiscardAllUpToLastField)
DataFile &dataFile = loadDataResult.value();
std::array<std::string_view, 5> expectedFields { "Values", "3", "", "3", "" };
std::vector<std::vector<std::string_view>> expectedFields {
{ "Test", "Empty", "Values" },
{ "", "2", "3" },
{ "1", "2", "" },
{ "1", "", "3" },
};
GetFieldResult result { dataFile.begin() };
GetFieldResult result { dataFile.data() };
const char *const end = dataFile.data() + dataFile.size();
unsigned row = 0;
do {
ASSERT_LT(row, expectedFields.size()) << "Too many records";
result = DiscardMultipleFields(result.next, dataFile.end(), 2);
if (row < expectedFields.size() - 1) {
result = DiscardMultipleFields(result.next, end, 2);
if (row < expectedFields.size()) {
EXPECT_FALSE(result.endOfRecord()) << "Parsing returned fewer fields than expected in record " << row;
if (!result.endOfRecord())
result = GetNextField(result.next, dataFile.end());
result = GetNextField(result.next, end);
}
EXPECT_EQ(result.value, expectedFields[row]) << "Unexpected last value at record " << row;
EXPECT_EQ(result.value, expectedFields[row][2]) << "Unexpected last value at record " << row;
if (!result.endOfRecord()) {
ADD_FAILURE() << "Parsing returned fewer fields than expected in record " << row;
result = DiscardRemainingFields(result.next, dataFile.end());
result = DiscardRemainingFields(result.next, end);
}
row++;
} while (!result.endOfFile());
@ -244,32 +422,34 @@ TEST(DataFileTest, SkipFieldIterator)
DataFile &dataFile = loadDataResult.value();
// we don't actually test the last value as incrementing the fields-in-record iterator past
// the last field invalidates the value, we can't recover the same way the procedural interface does.
std::array<std::string_view, 5> expectedFields { "Values", "3", "", "3", "" };
std::vector<std::vector<std::string_view>> expectedFields {
{ "Test", "Empty", "Values" },
{ "", "2", "3" },
{ "1", "2", "" },
{ "1", "", "3" },
};
RecordsRange recordsRange = dataFile.records();
auto record = recordsRange.begin();
auto endRecord = recordsRange.end();
unsigned row = 0;
while (record != endRecord) {
for (DataFileRecord record : dataFile) {
ASSERT_LT(row, expectedFields.size()) << "Too many records";
FieldsInRecordRange fieldRange = *record;
auto field = fieldRange.begin();
auto endField = fieldRange.end();
field += 2;
if (row < expectedFields.size() - 1) {
if (field == endField) {
auto fieldIt = record.begin();
auto endField = record.end();
fieldIt += 0;
EXPECT_EQ((*fieldIt).value(), expectedFields[row][0]) << "Advancing a field iterator by 0 should not discard any values";
fieldIt += 2;
if (row < expectedFields.size()) {
if (fieldIt == endField) {
ADD_FAILURE() << "Parsing returned fewer fields than expected in record " << row;
} else {
EXPECT_EQ(*field, expectedFields[row]) << "Unexpected last value at record " << row;
++field;
EXPECT_EQ((*fieldIt).value(), expectedFields[row][2]) << "Unexpected last value at record " << row;
++fieldIt;
}
}
EXPECT_EQ(field, endField) << "Parsing returned more fields than expected in record " << row;
EXPECT_EQ(fieldIt, endField) << "Parsing returned more fields than expected in record " << row;
EXPECT_EQ(fieldIt.column(), expectedFields[row].size() - 1) << "Field iterator should report the index of the last column";
++row;
++record;
}
EXPECT_EQ(row, expectedFields.size()) << "Parsing returned fewer records than expected";
@ -283,30 +463,31 @@ TEST(DataFileTest, SkipRowIterator)
DataFile &dataFile = loadDataResult.value();
std::vector<std::vector<std::string_view>> expectedFields {
// skipping the first two lines
{ "Test", "Empty", "Values" },
{ "", "2", "3" },
{ "1", "2", "" },
{ "1", "", "3" },
{ "" } // file ends with a newline, parsing returns a single empty field
};
RecordsRange recordsRange = dataFile.records();
auto record = recordsRange.begin();
auto endRecord = recordsRange.end();
record += 2;
unsigned row = 0;
while (record != endRecord) {
auto recordIt = dataFile.begin();
auto endRecord = dataFile.end();
recordIt += 0;
EXPECT_EQ((*(*recordIt).begin()).value(), expectedFields[0][0]) << "Advancing a record iterator by 0 should not discard any values";
recordIt += 2;
unsigned row = 2;
while (recordIt != endRecord) {
ASSERT_LT(row, expectedFields.size()) << "Too many records";
unsigned col = 0;
for (std::string_view field : *record) {
for (DataFileField field : *recordIt) {
if (col < expectedFields[row].size())
EXPECT_EQ(field, expectedFields[row][col]) << "Unexpected value at record " << row << " and field " << col;
EXPECT_EQ(*field, expectedFields[row][col]) << "Unexpected value at record " << row << " and field " << col;
else
ADD_FAILURE() << "Extra value '" << field << "' in record " << row << " at field " << col;
col++;
}
EXPECT_GE(col, expectedFields[row].size()) << "Parsing returned fewer fields than expected in record " << row;
++row;
++record;
++recordIt;
}
EXPECT_EQ(row, expectedFields.size()) << "Parsing returned fewer records than expected";

0
test/fixtures/txtdata/empty.tsv vendored

1
test/fixtures/txtdata/empty_with_utf8_bom.tsv vendored

@ -0,0 +1 @@


4
test/fixtures/txtdata/lf_no_trail.tsv vendored

@ -0,0 +1,4 @@
Test Empty Values
2 3
1 2
1 3
1 Test Empty Values
2 2 3
3 1 2
4 1 3

2
test/fixtures/txtdata/sample.tsv vendored

@ -0,0 +1,2 @@
String Byte Int Float
Sample 145 70322 6.34
1 String Byte Int Float
2 Sample 145 70322 6.34
Loading…
Cancel
Save