From 37ca4b9f37ecc42e532b3e72eb7c503f9dc750ed Mon Sep 17 00:00:00 2001 From: mrbean-bremen Date: Fri, 19 Dec 2025 08:38:25 +0100 Subject: [PATCH] DcmCharString: add some support for multi-byte characters - add DcmCharString::getVM(), getOFString() and putOFStringAtPos(), which handle multi-byte charsets - DcmByteString::containsExtendedCharacters(): add check for ESCAPE characters (only allowed in code extensions) - removed obsolete DcmCharString::containsExtendedCharacters() --- dcmdata/include/dcmtk/dcmdata/dcbytstr.h | 45 +++-- dcmdata/include/dcmtk/dcmdata/dcchrstr.h | 41 +++- dcmdata/include/dcmtk/dcmdata/dcitem.h | 9 +- dcmdata/include/dcmtk/dcmdata/dcsequen.h | 9 +- dcmdata/include/dcmtk/dcmdata/dcvrlt.h | 15 +- dcmdata/include/dcmtk/dcmdata/dcvrst.h | 14 +- dcmdata/include/dcmtk/dcmdata/dcvrut.h | 14 +- dcmdata/libsrc/dcbytstr.cc | 55 ++++-- dcmdata/libsrc/dcchrstr.cc | 238 ++++++++++++++++++++++- dcmdata/libsrc/dcvrlt.cc | 7 - dcmdata/libsrc/dcvrst.cc | 7 - dcmdata/libsrc/dcvrut.cc | 7 - dcmdata/tests/CMakeLists.txt | 1 + dcmdata/tests/tchrstr.cc | 224 +++++++++++++++++++++ dcmdata/tests/tchval.cc | 10 +- dcmdata/tests/tests.cc | 5 + 16 files changed, 598 insertions(+), 103 deletions(-) create mode 100644 dcmdata/tests/tchrstr.cc diff --git a/dcmdata/include/dcmtk/dcmdata/dcbytstr.h b/dcmdata/include/dcmtk/dcmdata/dcbytstr.h index 17a6225820..add6270170 100644 --- a/dcmdata/include/dcmtk/dcmdata/dcbytstr.h +++ b/dcmdata/include/dcmtk/dcmdata/dcbytstr.h @@ -274,20 +274,19 @@ class DCMTK_DCMDATA_EXPORT DcmByteString: public DcmElement */ virtual OFCondition verify(const OFBool autocorrect = OFFalse); - /** check if this element contains non-ASCII characters. Please note that this check - * is pretty simple and only works for single-byte character sets that do include - * the 7-bit ASCII codes, e.g. for the ISO 8859 family. In other words: All character - * codes below 128 are considered to be ASCII codes and all others are considered to - * be non-ASCII. 
+ /** check if this element contains non-ASCII characters. + * This works by checking for any byte values above 127, which works for any + * single-byte code and for single-value multi-byte codes, and for ESC characters, + * which will mean that a code extension is used. * @param checkAllStrings if true, also check elements with string values not affected * by SpecificCharacterSet (0008,0005). By default, only check PN, LO, LT, SH, ST, - * UC and UT, i.e. none of the derived VR classes. + * UC and UT. * @return true if element contains non-ASCII characters, false otherwise */ virtual OFBool containsExtendedCharacters(const OFBool checkAllStrings = OFFalse); /** check if this element is affected by SpecificCharacterSet - * @return always returns false since none of the derived VR classes is affected by + * @return returns false, overwritten by derived VR classes that are affected by * the SpecificCharacterSet (0008,0005) element */ virtual OFBool isAffectedBySpecificCharacterSet() const; @@ -379,6 +378,20 @@ class DCMTK_DCMDATA_EXPORT DcmByteString: public DcmElement */ virtual OFCondition makeMachineByteString(const Uint32 length = 0); + /** check if the VR supports more than one value. + * @return OFTrue + */ + virtual OFBool supportsMultiValue() const { return OFTrue; } + + /** find the start index of the next component. + * @param str pointer to the string value to be searched + * @param len the length of @a str + * @param start the start character index for the search + * @param charSet the value of Specific Character Set; not used + * @return the index of the next value, or OFString_npos if none exists. + */ + virtual size_t findNextValuePosition(const char* str, size_t len, size_t start, const OFString& charSet) const; + /** convert currently stored string value to DICOM representation. * It removes trailing spaces apart from a possibly required single padding * character (in case of odd string length). 
@@ -418,13 +431,23 @@ class DCMTK_DCMDATA_EXPORT DcmByteString: public DcmElement */ void setNonSignificantChars(const OFString &characters) { nonSignificantChars = characters; } + /** set element value at a specific value position in the given character string, + * considering the specific character set for finding the position, if given. + * @param stringVal input character string (possibly multi-valued) + * @param pos position (0..vm) where the value should be inserted + * @param charSet the value of the Specific Character Set + * @return status, EC_Normal if successful, an error code otherwise + */ + OFCondition putOFStringAtPosWithCharset(const OFString& stringVal, + const unsigned long pos, + const OFString& charSet); + /* --- static helper functions --- */ /** check if a given character string contains non-ASCII characters. - * Please note that this check is pretty simple and only works for single-byte character - * sets that do include the 7-bit ASCII codes, e.g. for the ISO 8859 family. In other - * words: All character codes below 128 are considered to be ASCII codes and all others - * are considered to be non-ASCII. + * This works by checking for any byte values above 127, which works for any + * single-byte code and for single-value multi-byte codes, and for ESC characters, + * which will mean that a code extension is used. * @param stringVal character string to be checked * @param stringLen length of the string (number of characters without the trailing * NULL byte) diff --git a/dcmdata/include/dcmtk/dcmdata/dcchrstr.h b/dcmdata/include/dcmtk/dcmdata/dcchrstr.h index ccc2542212..521c6d1988 100644 --- a/dcmdata/include/dcmtk/dcmdata/dcchrstr.h +++ b/dcmdata/include/dcmtk/dcmdata/dcchrstr.h @@ -107,17 +107,31 @@ class DCMTK_DCMDATA_EXPORT DcmCharString */ virtual OFCondition verify(const OFBool autocorrect = OFFalse); - /** check if this element contains non-ASCII characters. 
Please note that this check - * is pretty simple and only works for single-byte character sets that do include - * the 7-bit ASCII codes, e.g. for the ISO 8859 family. In other words: All character - * codes below 128 are considered to be ASCII codes and all others are considered to - * be non-ASCII. - * @param checkAllStrings not used in this class - * @return true if element contains non-ASCII characters, false otherwise + /** get value multiplicity + * @return number of string components (separated by a backslash) */ - virtual OFBool containsExtendedCharacters(const OFBool checkAllStrings = OFFalse); + virtual unsigned long getVM(); - /** check if this element is affected by SpecificCharacterSet + /** get a copy of a particular string component + * @param stringVal variable in which the result value is stored + * @param pos index of the value in case of multi-valued elements (0..vm-1) + * @param normalize not used since string normalization depends on value representation + * @return status, EC_Normal if successful, an error code otherwise + */ + virtual OFCondition getOFString(OFString &stringVal, + const unsigned long pos, + OFBool normalize = OFTrue); + + + /** set element value at specific VM position in the given character string. + * @param stringVal input character string (possibly multi-valued) + * @param pos position (0..vm) where the value should be inserted + * @return status, EC_Normal if successful, an error code otherwise + */ + virtual OFCondition putOFStringAtPos(const OFString& stringVal, + const unsigned long pos = 0); + + /** check if this element is affected by SpecificCharacterSet * @return always returns true since all derived VR classes are affected by the * SpecificCharacterSet (0008,0005) element */ @@ -169,6 +183,15 @@ class DCMTK_DCMDATA_EXPORT DcmCharString */ virtual const OFString& getDelimiterChars() const; + /** find the start index of the next value in a multi-valued attribute. 
+ * @param str pointer to the string value to be searched + * @param len the length of @a str + * @param start the start character index for the search + * @param charSet the value of Specific Character Set; if not set, single-byte encoding is assumed + * @return the index of the next value, or OFString_npos if none exists. + */ + virtual size_t findNextValuePosition(const char* str, size_t len, size_t start, const OFString& charSet) const; + }; diff --git a/dcmdata/include/dcmtk/dcmdata/dcitem.h b/dcmdata/include/dcmtk/dcmdata/dcitem.h index 84d2b6f3bf..e44b133723 100644 --- a/dcmdata/include/dcmtk/dcmdata/dcitem.h +++ b/dcmdata/include/dcmtk/dcmdata/dcitem.h @@ -317,11 +317,10 @@ class DCMTK_DCMDATA_EXPORT DcmItem */ virtual OFBool containsUnknownVR() const; - /** check if this object contains non-ASCII characters at any nesting level. Please note - * that this check is pretty simple and only works for single-byte character sets that - * do include the 7-bit ASCII codes, e.g. for the ISO 8859 family. In other words: All - * character codes below 128 are considered to be ASCII codes and all others are - * considered to be non-ASCII. + /** check if this object contains non-ASCII characters. + * This works by checking for any byte values above 127, which works for any + * single-byte code and for single-value multi-byte codes, and for ESC characters, + * which will mean that a code extension is used. * @param checkAllStrings if true, also check elements with string values not affected * by SpecificCharacterSet (0008,0005). By default, only check PN, LO, LT, SH, ST, * UC and UT. 
diff --git a/dcmdata/include/dcmtk/dcmdata/dcsequen.h b/dcmdata/include/dcmtk/dcmdata/dcsequen.h index 0c41721fc0..c93d76c6f9 100644 --- a/dcmdata/include/dcmtk/dcmdata/dcsequen.h +++ b/dcmdata/include/dcmtk/dcmdata/dcsequen.h @@ -313,11 +313,10 @@ class DCMTK_DCMDATA_EXPORT DcmSequenceOfItems : public DcmElement */ virtual OFBool containsUnknownVR() const; - /** check if this object contains non-ASCII characters at any nesting level. Please note - * that this check is pretty simple and only works for single-byte character sets that - * do include the 7-bit ASCII codes, e.g. for the ISO 8859 family. In other words: All - * character codes below 128 are considered to be ASCII codes and all others are - * considered to be non-ASCII. + /** check if this object contains non-ASCII characters. + * This works by checking for any byte values above 127, which works for any + * single-byte code and for single-value multi-byte codes, and for ESC characters, + * which will mean that a code extension is used. * @param checkAllStrings if true, also check elements with string values not affected * by SpecificCharacterSet (0008,0005). By default, only check PN, LO, LT, SH, ST, * UC and UT. diff --git a/dcmdata/include/dcmtk/dcmdata/dcvrlt.h b/dcmdata/include/dcmtk/dcmdata/dcvrlt.h index 846cada889..ca3e3686b6 100644 --- a/dcmdata/include/dcmtk/dcmdata/dcvrlt.h +++ b/dcmdata/include/dcmtk/dcmdata/dcvrlt.h @@ -114,13 +114,6 @@ class DCMTK_DCMDATA_EXPORT DcmLongText virtual OFCondition checkValue(const OFString &vm = "", const OFBool oldFormat = OFFalse); - /** get the value multiplicity. - * Since the backslash "\" is not regarded as a separator the value - * multiplicity is always 1. 
- * @return value multiplicity of the currently stored value - */ - virtual unsigned long getVM(); - /** get a copy of a particular string component * @param stringVal variable in which the result value is stored * @param pos index of the value in case of multi-valued elements (0..vm-1) @@ -151,6 +144,14 @@ class DCMTK_DCMDATA_EXPORT DcmLongText */ static OFCondition checkStringValue(const OFString &value, const OFString &charset = ""); + + protected: + /** check if the VR supports more than one value. + * Since the backslash "\" is not regarded as a separator, + * multiple values cannot be encoded. + * @return OFFalse + */ + virtual OFBool supportsMultiValue() const { return OFFalse; }; }; diff --git a/dcmdata/include/dcmtk/dcmdata/dcvrst.h b/dcmdata/include/dcmtk/dcmdata/dcvrst.h index e012a46910..3b04499e59 100644 --- a/dcmdata/include/dcmtk/dcmdata/dcvrst.h +++ b/dcmdata/include/dcmtk/dcmdata/dcvrst.h @@ -115,13 +115,6 @@ class DCMTK_DCMDATA_EXPORT DcmShortText virtual OFCondition checkValue(const OFString &vm = "", const OFBool oldFormat = OFFalse); - /** get the value multiplicity. - * Since the backslash "\" is not regarded as a separator the value - * multiplicity is always 1. - * @return value multiplicity of the currently stored value - */ - virtual unsigned long getVM(); - /** get a copy of a particular string component * @param stringVal variable in which the result value is stored * @param pos index of the value in case of multi-valued elements (0..vm-1) @@ -152,6 +145,13 @@ class DCMTK_DCMDATA_EXPORT DcmShortText */ static OFCondition checkStringValue(const OFString &value, const OFString &charset = ""); +protected: + /** check if the VR supports more than one value. + * Since the backslash "\" is not regarded as a separator, + * multiple values cannot be encoded. 
+ * @return OFFalse + */ + virtual OFBool supportsMultiValue() const { return OFFalse; }; }; diff --git a/dcmdata/include/dcmtk/dcmdata/dcvrut.h b/dcmdata/include/dcmtk/dcmdata/dcvrut.h index 3b8121b9cd..17341197ec 100644 --- a/dcmdata/include/dcmtk/dcmdata/dcvrut.h +++ b/dcmdata/include/dcmtk/dcmdata/dcvrut.h @@ -117,13 +117,6 @@ class DCMTK_DCMDATA_EXPORT DcmUnlimitedText virtual OFCondition checkValue(const OFString &vm = "", const OFBool oldFormat = OFFalse); - /** get the value multiplicity. - * Since the backslash "\" is not regarded as a separator the value - * multiplicity is always 1. - * @return value multiplicity of the currently stored value - */ - virtual unsigned long getVM(); - /** get a copy of a particular string component * @param stringVal variable in which the result value is stored * @param pos index of the value in case of multi-valued elements (0..vm-1) @@ -154,6 +147,13 @@ class DCMTK_DCMDATA_EXPORT DcmUnlimitedText */ static OFCondition checkStringValue(const OFString &value, const OFString &charset = ""); +protected: + /** check if the VR supports more than one value. + * Since the backslash "\" is not regarded as a separator, + * multiple values cannot be encoded. 
+ * @return OFFalse + */ + virtual OFBool supportsMultiValue() const { return OFFalse; }; }; diff --git a/dcmdata/libsrc/dcbytstr.cc b/dcmdata/libsrc/dcbytstr.cc index 73f77375fd..92385658f7 100644 --- a/dcmdata/libsrc/dcbytstr.cc +++ b/dcmdata/libsrc/dcbytstr.cc @@ -467,9 +467,8 @@ OFCondition DcmByteString::putString(const char *stringVal, return errorFlag; } - -OFCondition DcmByteString::putOFStringAtPos(const OFString& stringVal, - const unsigned long pos) +OFCondition DcmByteString::putOFStringAtPosWithCharset(const OFString& stringVal, const unsigned long pos, + const OFString& charSet) { OFCondition result; // Get old value @@ -505,8 +504,8 @@ OFCondition DcmByteString::putOFStringAtPos(const OFString& stringVal, // First value is set: Replace old value with new value else { - rightPos = str.find_first_of('\\', 0); - str = str.replace(0, rightPos, stringVal); + rightPos = findNextValuePosition(str.c_str(), str.length(), 0, charSet); + str = str.replace(0, rightPos - 1, stringVal); } return putOFStringArray(str); } @@ -514,29 +513,25 @@ OFCondition DcmByteString::putOFStringAtPos(const OFString& stringVal, // 3rd case: New value should be inserted somewhere in the middle size_t leftPos = 0; size_t vmPos = 0; + size_t strLen = str.length(); // First, find the correct position, and then insert / replace new value do { // Step from value to value by looking for delimiters. - // Special handling first search (start looking at position 0 instead of 1) - if (vmPos == 0) leftPos = str.find('\\', 0); - else leftPos = str.find('\\', leftPos + 1 ); - // leftPos = str.find('\\', leftPos == 0 ? 
0 : leftPos +1); + leftPos = findNextValuePosition(str.c_str(), strLen, leftPos, charSet); if (leftPos != OFString_npos) - { vmPos++; - } } while ( (leftPos != OFString_npos) && (vmPos != pos) ); - rightPos = str.find_first_of('\\', leftPos+1); - if (rightPos == OFString_npos) rightPos = str.length(); + rightPos = findNextValuePosition(str.c_str(), strLen, leftPos, charSet); + if (rightPos == OFString_npos) rightPos = strLen + 1; // If we do not have an old value of size 1 or we have an empty value if (rightPos - leftPos == 1) { // Empty value if (str.at(leftPos) == '\\') - str = str.insert(rightPos, stringVal); + str = str.insert(leftPos, stringVal); // Old value (length 1) else str = str.replace(leftPos, 1, stringVal); @@ -544,7 +539,7 @@ OFCondition DcmByteString::putOFStringAtPos(const OFString& stringVal, // Otherwise replace existing old value (length > 1) else { - str = str.replace(leftPos+1, rightPos - leftPos - 1, stringVal); + str = str.replace(leftPos, rightPos - leftPos - 1, stringVal); } // Finally re-insert all values include new value result = putOFStringArray( str ); @@ -553,6 +548,28 @@ OFCondition DcmByteString::putOFStringAtPos(const OFString& stringVal, } + +OFCondition DcmByteString::putOFStringAtPos(const OFString& stringVal, + const unsigned long pos) +{ + return putOFStringAtPosWithCharset(stringVal, pos, ""); +} + + +// ******************************** + + +size_t DcmByteString::findNextValuePosition(const char* str, size_t len, size_t start, const OFString& /*charSet*/) const +{ + const char *p = str + start; + for (size_t i = start; i < len; ++i) + { + if (*p++ == '\\') + return i + 1; + } + return OFString_npos; +} + // ******************************** @@ -770,7 +787,7 @@ OFBool DcmByteString::containsExtendedCharacters(const OFBool checkAllStrings) OFBool result = OFFalse; /* only check if parameter is true since derived VRs are not affected by the attribute SpecificCharacterSet (0008,0005) */ - if (checkAllStrings) + if 
(checkAllStrings || isAffectedBySpecificCharacterSet()) { char *str = NULL; Uint32 len = 0; @@ -876,10 +893,10 @@ OFBool DcmByteString::containsExtendedCharacters(const char *stringVal, { if (stringVal != NULL) { - for (size_t i = stringLen; i != 0; --i) + for (size_t i = stringLen; i != 0; --i, ++stringVal) { - /* check for 8 bit characters */ - if (OFstatic_cast(unsigned char, *stringVal++) > 127) + /* check for 8 bit and Escape characters */ + if ((*stringVal & 0x80) != 0 || (*stringVal == 0x1b)) return OFTrue; } } diff --git a/dcmdata/libsrc/dcchrstr.cc b/dcmdata/libsrc/dcchrstr.cc index a10d49d99e..0c50961903 100644 --- a/dcmdata/libsrc/dcchrstr.cc +++ b/dcmdata/libsrc/dcchrstr.cc @@ -45,6 +45,85 @@ #include "dcmtk/dcmdata/dcchrstr.h" +static unsigned long getMaximumNumberOfValues(const OFString& s, Uint32 len) +{ + // a byte representing a backslash may also be part of a multi-byte character, + // so the found value may be higher than the real VM + unsigned long vm = 1; + const char *p = s.c_str(); + for (size_t i = 0; i < len; i++) + if (*p++ == '\\') + ++vm; + return vm; +} + +static OFCondition getOFStringAtIndex( + OFString& stringVal, const unsigned long pos, const char *str, Uint32 len) +{ + // works for single-byte encodings + const char *p = str; + const char *start = str; + unsigned long vm = 1; + for (size_t i = 0; i < len; i++) + { + if (*p++ == '\\') + { + if (pos == vm) + start = p; + else if (pos + 1 == vm) + { + stringVal.assign(start, p - 1); + return EC_Normal; + } + ++vm; + } + } + if (pos + 1 == vm) + { + stringVal.assign(start, str + len); + return EC_Normal; + } + if (pos > 0) + return EC_IllegalParameter; + stringVal.clear(); + return EC_Normal; +} + + +// ******************************** + +// helper functions dealing with specific character sets + +static OFBool isMultiValuedCharacterSet(const OFString& charset) +{ + return charset.find('\\') != OFString_npos; +} + +static void skipMultiByteEscapeSequence(const char *&p, size_t &i, 
size_t len) +{ + if ((*p != 0x1b) || (i + 2 >= len)) + return; + + // found an escape sequence, check if it is for a multi-byte encoding + ++i; + // The escape sequence for the following encodings starts with "$": + // ISO 2022 IR 87, ISO 2022 IR 159, ISO 2022 IR 149, ISO 2022 IR 58 + bool isMultiByte = *++p == '$'; + if (!isMultiByte && *p == '-') + { + ++i; + isMultiByte = *++p == 'T'; // ISO 2022 IR 166 + } + if (!isMultiByte) + return; + + // we are inside a part encoded using a multi-byte extension, + // skip until the next escape sequence or the end of the value + while (++i < len - 2 && *p++ != 0x1b) {} +} + +// ******************************** + DcmCharString::DcmCharString(const DcmTag &tag, const Uint32 len) : DcmByteString(tag, len) @@ -140,24 +219,136 @@ OFCondition DcmCharString::verify(const OFBool autocorrect) } -OFBool DcmCharString::containsExtendedCharacters(const OFBool /*checkAllStrings*/) +// ******************************** + + +unsigned long DcmCharString::getVM() { - OFBool result = OFFalse; + // the vast majority of values have VM 0 or 1, so optimize for these char *str = NULL; Uint32 len = 0; - /* determine length in order to support possibly embedded NULL bytes */ - if (getString(str, len).good()) - result = DcmByteString::containsExtendedCharacters(str, len); - return result; + OFCondition result = getString(str, len); + if (!result.good() || (str == NULL) || (len == 0)) + return 0; + + if (!supportsMultiValue()) + return 1; + + unsigned long vm = getMaximumNumberOfValues(str, len); + if (vm == 1 || !containsExtendedCharacters()) + return vm; + + // We have a string containing extended characters and possibly backslashes - + // now we have to get the Specific Character Set to filter out bytes with the + // value for backslash (0x5C) that are part of a multi-byte character. 
+ OFString charset; + result = getSpecificCharacterSet(charset); + if (!result.good() || charset.empty()) + return vm; + + if (isMultiValuedCharacterSet(charset) || + DcmSpecificCharacterSet::isNonASCIIConformMultiByteSingleValueCharacterSet(charset)) + { + vm = 1; + size_t startPos = 0; + size_t valuePos; + while ((valuePos = findNextValuePosition(str, len, startPos, charset)) != OFString_npos) + { + ++vm; + startPos = valuePos; + } + } + + return vm; +} + + +// ******************************** + + +OFCondition DcmCharString::getOFString(OFString& stringVal, const unsigned long pos, OFBool /*normalize*/) +{ + char *str = NULL; + Uint32 len = 0; + OFCondition result = getString(str, len); + if (result.bad()) + return result; + + if ((str == NULL) || (len == 0)) + { + if (pos > 0) + return EC_IllegalParameter; + stringVal.clear(); + return EC_Normal; + } + + if (!supportsMultiValue() || getMaximumNumberOfValues(str, len) == 1) + { + if (pos > 0) + return EC_IllegalParameter; + stringVal.assign(str, str + len); + return EC_Normal; + } + + // only check for multi-byte character sets if the value contains any non-ASCII characters + // or Escape sequences + if (containsExtendedCharacters()) + { + // We have a string containing extended characters and possibly backslashes - + // now we have to get the Specific Character Set to filter out bytes with the + // value for backslash (0x5C) that are part of a multi-byte character. 
+ OFString charset; + result = getSpecificCharacterSet(charset); + if (result.good() && !charset.empty() && + (isMultiValuedCharacterSet(charset) || + DcmSpecificCharacterSet::isNonASCIIConformMultiByteSingleValueCharacterSet(charset))) + { + unsigned long index = 0; + size_t valuePos = 0; + while (index < pos && + (valuePos = findNextValuePosition(str, len, valuePos, charset)) != OFString_npos) + ++index; + if (valuePos == OFString_npos) + return EC_IllegalParameter; + if (valuePos == len) + stringVal.clear(); + else + { + size_t valueEnd = findNextValuePosition(str, len, valuePos, charset); + if (valueEnd == OFString_npos) + valueEnd = len + 1; + // account for the backslash before the end pointer + stringVal.assign(str + valuePos, str + valueEnd - 1); + } + return EC_Normal; + } + } + // single-byte, single-value encoding, or value without extended characters + return getOFStringAtIndex(stringVal, pos, str, len); +} + +OFCondition DcmCharString::putOFStringAtPos(const OFString& stringVal, const unsigned long pos) +{ + OFString charset; + if (getSpecificCharacterSet(charset).bad()) + charset.clear(); + + return putOFStringAtPosWithCharset(stringVal, pos, charset); } +// ******************************** + + OFBool DcmCharString::isAffectedBySpecificCharacterSet() const { return OFTrue; } +// ******************************** + + OFCondition DcmCharString::convertCharacterSet(DcmSpecificCharacterSet &converter) { char *str = NULL; @@ -274,6 +465,41 @@ const OFString& DcmCharString::getDelimiterChars() const return DcmVR(ident()).getDelimiterChars(); } +size_t DcmCharString::findNextValuePosition(const char* str, size_t len, size_t start, const OFString& charSet) const +{ + if (charSet.empty()) + return DcmByteString::findNextValuePosition(str, len, start, charSet); + + const char *p = str + start; + if (DcmSpecificCharacterSet::isNonASCIIConformMultiByteSingleValueCharacterSet(charSet)) + { + // special handling to find real backslashes in chinese multi-bytes 
encodings; + // the first byte for 2-byte characters, and the first and third bytes of 4-byte + // characters are always > 0x80, so we can exclude these characters + for (size_t i = start; i < len; ++i, ++p) + { + if (*p == '\\') + return i + 1; + if ((*p & 0x80) != 0) + { + // this is a 2-byte character or the first or second part + // of a 4-byte character - skip the next byte + ++p; + ++i; + } + } + return OFString_npos; + } + + for (size_t i = start; i < len; ++i, ++p) + { + if (*p == '\\') + return i + 1; + skipMultiByteEscapeSequence(p, i, len); + } + + return OFString_npos; +} OFBool DcmCharString::isUniversalMatch(const OFBool normalize, const OFBool enableWildCardMatching) diff --git a/dcmdata/libsrc/dcvrlt.cc b/dcmdata/libsrc/dcvrlt.cc index 140703bcd6..3f91ad03e5 100644 --- a/dcmdata/libsrc/dcvrlt.cc +++ b/dcmdata/libsrc/dcvrlt.cc @@ -126,13 +126,6 @@ OFCondition DcmLongText::checkValue(const OFString & /*vm*/, } -unsigned long DcmLongText::getVM() -{ - /* value multiplicity is 1 for non-empty string, 0 otherwise */ - return (getRealLength() > 0) ? 1 : 0; -} - - // ******************************** diff --git a/dcmdata/libsrc/dcvrst.cc b/dcmdata/libsrc/dcvrst.cc index 7d17f5876d..4b4974eb81 100644 --- a/dcmdata/libsrc/dcvrst.cc +++ b/dcmdata/libsrc/dcvrst.cc @@ -126,13 +126,6 @@ OFCondition DcmShortText::checkValue(const OFString & /*vm*/, } -unsigned long DcmShortText::getVM() -{ - /* value multiplicity is 1 for non-empty string, 0 otherwise */ - return (getRealLength() > 0) ? 1 : 0; -} - - // ******************************** diff --git a/dcmdata/libsrc/dcvrut.cc b/dcmdata/libsrc/dcvrut.cc index 0f0a6d048d..0c8c18075e 100644 --- a/dcmdata/libsrc/dcvrut.cc +++ b/dcmdata/libsrc/dcvrut.cc @@ -127,13 +127,6 @@ OFCondition DcmUnlimitedText::checkValue(const OFString & /*vm*/, } -unsigned long DcmUnlimitedText::getVM() -{ - /* value multiplicity is 1 for non-empty string, 0 otherwise */ - return (getRealLength() > 0) ? 
1 : 0; -} - - // ******************************** diff --git a/dcmdata/tests/CMakeLists.txt b/dcmdata/tests/CMakeLists.txt index 98ea562ea7..a1dae78fe6 100644 --- a/dcmdata/tests/CMakeLists.txt +++ b/dcmdata/tests/CMakeLists.txt @@ -1,6 +1,7 @@ # declare executables DCMTK_ADD_TEST_EXECUTABLE(dcmdata_tests tbytestr.cc + tchrstr.cc tchval.cc tdict.cc telemlen.cc diff --git a/dcmdata/tests/tchrstr.cc b/dcmdata/tests/tchrstr.cc new file mode 100644 index 0000000000..14e09d7623 --- /dev/null +++ b/dcmdata/tests/tchrstr.cc @@ -0,0 +1,224 @@ +/* + * + * Copyright (C) 2025, OFFIS e.V. + * All rights reserved. See COPYRIGHT file for details. + * + * This software and supporting documentation were developed by + * + * OFFIS e.V. + * R&D Division Health + * Escherweg 2 + * D-26121 Oldenburg, Germany + * + * + * Module: dcmdata + * + * Purpose: test program for DcmCharString and derived classes + * + */ + + +#include +#include +#include +#include +#include + +#include "dcmtk/config/osconfig.h" /* make sure OS specific configuration is included first */ + +#include "dcmtk/ofstd/oftest.h" +#include "dcmtk/dcmdata/dcdatset.h" +#include "dcmtk/dcmdata/dcchrstr.h" +#include "dcmtk/dcmdata/dcdeftag.h" +#include "dcmtk/dcmdata/dcvrlo.h" +#include "dcmtk/dcmdata/dcvrlt.h" + + +OFTEST(dcmdata_charString_derived_getVM) +{ + // backslashes are not delimiters in LT, ST and UT, + // but are in SH, LO, UC and PN + DcmDataset dataset; + DcmLongString* longString = new DcmLongString(DCM_StudyDescription, 0); + dataset.insert(longString); + OFCHECK_EQUAL(longString->getVM(), 0); + longString->putString("One\\Two\\Three"); + OFCHECK_EQUAL(longString->getVM(), 3); + + DcmShortString* shortString = new DcmShortString(DCM_AccessionNumber, 0); + dataset.insert(shortString); + OFCHECK_EQUAL(shortString->getVM(), 0); + shortString->putString("One\\Two\\Three"); + OFCHECK_EQUAL(shortString->getVM(), 3); + + DcmUnlimitedCharacters* unlimitedChars = new 
DcmUnlimitedCharacters(DCM_GeneticModificationsDescription, 0);
+    dataset.insert(unlimitedChars);
+    OFCHECK_EQUAL(unlimitedChars->getVM(), 0);
+    unlimitedChars->putString("One\\Two\\Three");
+    OFCHECK_EQUAL(unlimitedChars->getVM(), 3);
+
+    DcmPersonName* personName = new DcmPersonName(DCM_PatientName, 0);
+    dataset.insert(personName);
+    OFCHECK_EQUAL(personName->getVM(), 0);
+    personName->putString("One\\Two\\Three");
+    OFCHECK_EQUAL(personName->getVM(), 3);
+
+    DcmLongText* longText = new DcmLongText(DCM_InventoryPurpose, 0);
+    dataset.insert(longText);
+    OFCHECK_EQUAL(0, longText->getVM());
+    longText->putString("One\\Two\\Three");
+    OFCHECK_EQUAL(1, longText->getVM());
+
+    DcmShortText* shortText = new DcmShortText(DCM_InventoryPurpose, 0);
+    dataset.insert(shortText);
+    OFCHECK_EQUAL(shortText->getVM(), 0);
+    shortText->putString("One\\Two\\Three");
+    OFCHECK_EQUAL(shortText->getVM(), 1);
+
+    DcmUnlimitedText* unlimitedText = new DcmUnlimitedText(DCM_StrainAdditionalInformation, 0);
+    dataset.insert(unlimitedText);
+    OFCHECK_EQUAL(unlimitedText->getVM(), 0);
+    unlimitedText->putString("One\\Two\\Three");
+    OFCHECK_EQUAL(unlimitedText->getVM(), 1);
+}
+
+OFTEST(dcmdata_charString_getVM_multibyte) {
+    DcmDataset dataset;
+    DcmLongString* studyDescr = new DcmLongString(DCM_StudyDescription, 0);
+    dataset.insert(studyDescr);
+
+    // single-byte/single-value encoding (Latin1)
+    dataset.putAndInsertString(DCM_SpecificCharacterSet, "ISO_IR 100");
+    // \x5c is the backslash character, i.e. a value delimiter in a single-byte encoding
+    studyDescr->putString("Smith\\\x83\x5c");
+    OFCHECK_EQUAL(studyDescr->getVM(), 3);
+
+    // multi-byte/single-value encoding
+    dataset.putAndInsertString(DCM_SpecificCharacterSet, "GB18030");
+    // \x5c is now the second byte of the 2-byte GB18030 character \x83\x5c, not a delimiter
+    OFCHECK_EQUAL(studyDescr->getVM(), 2);
+
+    // single-byte/multi-value encoding (Latin1 + Greek); NOTE(review): DICOM lists "ISO 2022 IR ..." terms for multi-valued sets - confirm
+    dataset.putAndInsertString(DCM_SpecificCharacterSet, "ISO_IR 100\\ISO_IR 126");
+    studyDescr->putString("Dionysios=\x1b\x2d\x46\xc4\xe9\xef\xed\xf5\xf3\xe9\xef\xf2");
+    OFCHECK_EQUAL(studyDescr->getVM(), 1);
+    // a backslash inside a single-byte code extension (after ESC 0x2d 0x46) is still a delimiter
+    studyDescr->putString("Dionysios=\x1b\x2d\x46\xc4\xe9\xef\\\xed\xf5\xf3\xe9\xef\xf2");
+    OFCHECK_EQUAL(studyDescr->getVM(), 2);
+
+    // code extension with multi-byte encoding
+    dataset.putAndInsertString(DCM_SpecificCharacterSet, "ISO 2022 IR 13\\ISO 2022 IR 87");
+    studyDescr->putString("One\\Two\\Three");
+    OFCHECK_EQUAL(studyDescr->getVM(), 3);
+    // 0x5c as second byte of a 2-byte character (between ESC $ B and ESC ( J) is not a delimiter
+    studyDescr->putString("Smith=\x1b$BK\\\x1b(J");
+    OFCHECK_EQUAL(studyDescr->getVM(), 1);
+}
+
+OFTEST(dcmdata_charString_getOFString) {
+    DcmDataset dataset;
+    DcmLongString* studyDescr = new DcmLongString(DCM_StudyDescription, 0);
+    dataset.insert(studyDescr);
+
+    // single-byte/single-value encoding (Latin1)
+    dataset.putAndInsertString(DCM_SpecificCharacterSet, "ISO_IR 100");
+    // \x5c is the backslash character, i.e. a value delimiter in a single-byte encoding
+    OFString stringValue;
+    studyDescr->putString("John\\\x83\x5cSmith");
+    OFCHECK(studyDescr->getOFString(stringValue, 0).good());
+    OFCHECK_EQUAL(stringValue, "John");
+    OFCHECK(studyDescr->getOFString(stringValue, 1).good());
+    OFCHECK_EQUAL(stringValue, "\x83");
+    OFCHECK(studyDescr->getOFString(stringValue, 2).good());
+    OFCHECK_EQUAL(stringValue, "Smith");
+
+
+    // multi-byte/single-value encoding
+    dataset.putAndInsertString(DCM_SpecificCharacterSet, "GB18030");
+    // \x5c is now the second byte of the 2-byte GB18030 character \x83\x5c, not a delimiter
+    OFCHECK(studyDescr->getOFString(stringValue, 0).good());
+    OFCHECK_EQUAL(stringValue, "John");
+    OFCHECK(studyDescr->getOFString(stringValue, 1).good());
+    OFCHECK_EQUAL(stringValue, "\x83\x5cSmith");
+
+    // code extension with multi-byte encoding
+    dataset.putAndInsertString(DCM_SpecificCharacterSet, "ISO 2022 IR 13\\ISO 2022 IR 87");
+    // 0x5c as second byte of a 2-byte character (between ESC $ B and ESC ( J) is not a delimiter
+    studyDescr->putString("Smith=\x1b$BK\\\x1b(J");
+    OFCHECK(studyDescr->getOFString(stringValue, 0).good());
+    OFCHECK_EQUAL(stringValue, "Smith=\x1b$BK\\\x1b(J");
+}
+
+OFTEST(dcmdata_charString_getOFStringArray) {
+    DcmDataset dataset;
+    DcmLongString* studyDescr = new DcmLongString(DCM_StudyDescription, 0);
+    dataset.insert(studyDescr);
+
+    // single-byte/single-value encoding (Latin1)
+    dataset.putAndInsertString(DCM_SpecificCharacterSet, "ISO_IR 100");
+    // \x5c is the backslash character, i.e. a value delimiter in a single-byte encoding
+    OFString stringValue;
+    studyDescr->putString("John\\\x83\x5cSmith");
+    OFCHECK(studyDescr->getOFStringArray(stringValue).good());
+    OFCHECK_EQUAL(stringValue, "John\\\x83\x5cSmith");
+
+    // multi-byte/single-value encoding
+    dataset.putAndInsertString(DCM_SpecificCharacterSet, "GB18030");
+    // \x5c is now the second byte of the 2-byte GB18030 character \x83\x5c, not a delimiter
+    // the complete value is expected back unchanged, wherever the delimiters are
+    OFCHECK(studyDescr->getOFStringArray(stringValue).good());
+    OFCHECK_EQUAL(stringValue, "John\\\x83\x5cSmith");
+
+    // code extension with multi-byte encoding
+    dataset.putAndInsertString(DCM_SpecificCharacterSet, "ISO 2022 IR 13\\ISO 2022 IR 87");
+    // 0x5c as second byte of a 2-byte character (between ESC $ B and ESC ( J) is not a delimiter
+    studyDescr->putString("Smith=\x1b$BK\\\x1b(J");
+    OFCHECK(studyDescr->getOFStringArray(stringValue).good());
+    OFCHECK_EQUAL(stringValue, "Smith=\x1b$BK\\\x1b(J");
+}
+
+OFTEST(dcmdata_charString_putOFStringAtPos) {
+    DcmDataset dataset;
+    DcmLongString* studyDescr = new DcmLongString(DCM_StudyDescription, 0);
+    dataset.insert(studyDescr);
+
+    // single-byte/single-value encoding (Latin1)
+    dataset.putAndInsertString(DCM_SpecificCharacterSet, "ISO_IR 100");
+    // \x5c is the backslash character, i.e. a value delimiter in a single-byte encoding
+    OFString stringValue;
+    studyDescr->putString("John\\\x83\x5cSmith");
+    OFCHECK(studyDescr->putOFStringAtPos("James", 0).good());
+    OFCHECK(studyDescr->getOFStringArray(stringValue).good());
+    OFCHECK_EQUAL(stringValue, "James\\\x83\x5cSmith");
+    OFCHECK(studyDescr->putOFStringAtPos("H", 1).good());
+    OFCHECK(studyDescr->getOFStringArray(stringValue).good());
+    OFCHECK_EQUAL(stringValue, "James\\H\\Smith");
+
+
+    // multi-byte/single-value encoding
+    dataset.putAndInsertString(DCM_SpecificCharacterSet, "GB18030");
+    // \x5c is now the second byte of the 2-byte GB18030 character \x83\x5c, not a delimiter
+    studyDescr->putString("John\\\x83\x5cSmith");
+    OFCHECK(studyDescr->putOFStringAtPos("James", 0).good());
+    OFCHECK(studyDescr->getOFStringArray(stringValue).good());
+    OFCHECK_EQUAL(stringValue, "James\\\x83\x5cSmith");
+    OFCHECK(studyDescr->putOFStringAtPos("H", 1).good());
+    OFCHECK(studyDescr->getOFStringArray(stringValue).good());
+    OFCHECK_EQUAL(stringValue, "James\\H");
+    studyDescr->putString("John\\\x83\x5cSmith");
+    OFCHECK(studyDescr->putOFStringAtPos("Baker", 2).good());
+    OFCHECK(studyDescr->getOFStringArray(stringValue).good());
+    OFCHECK_EQUAL(stringValue, "John\\\x83\x5cSmith\\Baker");
+
+    // code extension with multi-byte encoding
+    dataset.putAndInsertString(DCM_SpecificCharacterSet, "ISO 2022 IR 13\\ISO 2022 IR 87");
+    // 0x5c as second byte of a 2-byte character (between ESC $ B and ESC ( J) is not a delimiter
+    studyDescr->putString("Smith=\x1b$BK\\\x1b(J");
+    OFCHECK(studyDescr->putOFStringAtPos("Doe", 0).good());
+    OFCHECK(studyDescr->getOFStringArray(stringValue).good());
+    OFCHECK_EQUAL(stringValue, "Doe");
+    studyDescr->putString("Smith=\x1b$BK\\\x1b(J");
+    OFCHECK(studyDescr->putOFStringAtPos("Jane", 1).good());
+    OFCHECK(studyDescr->getOFStringArray(stringValue).good());
+    OFCHECK_EQUAL(stringValue, "Smith=\x1b$BK\\\x1b(J\\Jane");
+}
diff --git a/dcmdata/tests/tchval.cc b/dcmdata/tests/tchval.cc
index 5e07dd6e44..459d5b662a 100644
--- a/dcmdata/tests/tchval.cc
+++ b/dcmdata/tests/tchval.cc
@@ -165,9 +165,8 @@ OFTEST(dcmdata_checkStringValue)
     // maximum length cannot be checked if given in characters (and not bytes)
     // CHECK_BAD ( "LO-07", DcmLongString::checkStringValueu("OFFIS e.V., Escherweg 2, 26121 Oldenburg, Germany, http://www.offis.de/", "1") )
     CHECK_GOOD( "LO-08", DcmLongString::checkStringValue("\\ _2_ \\ _3_ \\ _4_ \\ _5_ \\", "6") )
-    // actually, the following test should fail
-    CHECK_GOOD( "LO-09", DcmLongString::checkStringValue("ESC only allowed for ISO 2022 character set control sequences: \033", "1") )
-    CHECK_BAD ( "LO-10", DcmLongString::checkStringValue("also not allowed: \r\014", "1") )
+    CHECK_BAD ( "LO-09", DcmLongString::checkStringValue("ESC only allowed for charset extension \033", "1") )
+    CHECK_BAD ( "LO-10", DcmLongString::checkStringValue("not allowed: \r\014", "1") )
 
     /* test "Long Text" */
     CHECK_GOOD( "LT-01", DcmLongText::checkStringValue(" Hello \\ 12345 \\ \344\366\374\337 ", "ISO_IR 100") )
@@ -215,8 +214,7 @@ OFTEST(dcmdata_checkStringValue)
     CHECK_GOOD( "SH-08", DcmShortString::checkStringValue("\\ _2_ \\ _3_ \\ _4_ \\ _5_ \\", "6") )
     CHECK_BAD ( "SH-09", DcmShortString::checkStringValue(" ", "2") )
     CHECK_GOOD( "SH-10", DcmShortString::checkStringValue("", "2") )
-    // actually, the following test should fail
-    CHECK_GOOD( "SH-11", DcmShortString::checkStringValue("not allowed: \033", "1") )
+    CHECK_BAD ( "SH-11", DcmShortString::checkStringValue("not allowed: \033", "1") )
     CHECK_BAD ( "SH-12", DcmShortString::checkStringValue("not allowed: \n\r", "1") )
     CHECK_BAD ( "SH-13", DcmShortString::checkStringValue("not allowed: \010\014", "1") )
 
@@ -248,7 +246,7 @@ OFTEST(dcmdata_checkStringValue)
     CHECK_GOOD( "UC-01", DcmUnlimitedCharacters::checkStringValue("ABC", "1") )
     CHECK_GOOD( "UC-02", DcmUnlimitedCharacters::checkStringValue("ABC\\123", "2") )
     CHECK_GOOD( "UC-03", DcmUnlimitedCharacters::checkStringValue(" J\366rg Riesmeier ", "1", "ISO_IR 100") )
-    CHECK_GOOD( "UC-04", DcmUnlimitedCharacters::checkStringValue("ESC\033aping", "1") )
+    CHECK_BAD ( "UC-04", DcmUnlimitedCharacters::checkStringValue("ESC only allowed for charset extension \033", "1") )
     CHECK_BAD ( "UC-05", DcmUnlimitedCharacters::checkStringValue("not allowed: \n\010\r\014", "1") )
     CHECK_GOOD( "UC-06", DcmUnlimitedCharacters::checkStringValue(" ", "1") )
     CHECK_GOOD( "UC-07", DcmUnlimitedCharacters::checkStringValue("A\\B", "2") )
diff --git a/dcmdata/tests/tests.cc b/dcmdata/tests/tests.cc
index c8b40d4215..6949096348 100644
--- a/dcmdata/tests/tests.cc
+++ b/dcmdata/tests/tests.cc
@@ -25,6 +25,11 @@
 
 OFTEST_REGISTER(dcmdata_partialElementAccess);
 OFTEST_REGISTER(dcmdata_i2d_bmp);
+OFTEST_REGISTER(dcmdata_charString_derived_getVM);
+OFTEST_REGISTER(dcmdata_charString_getVM_multibyte);
+OFTEST_REGISTER(dcmdata_charString_getOFString);
+OFTEST_REGISTER(dcmdata_charString_getOFStringArray);
+OFTEST_REGISTER(dcmdata_charString_putOFStringAtPos);
 OFTEST_REGISTER(dcmdata_checkStringValue);
 OFTEST_REGISTER(dcmdata_determineVM);
 OFTEST_REGISTER(dcmdata_getValueFromString);