From 37ca4b9f37ecc42e532b3e72eb7c503f9dc750ed Mon Sep 17 00:00:00 2001
From: mrbean-bremen <hansemrbean@googlemail.com>
Date: Fri, 19 Dec 2025 08:38:25 +0100
Subject: [PATCH] DcmCharString: add some support for multi-byte characters

- add DcmCharString::getVM(), getOFString() and putOFStringAtPos(), which handle multi-byte charsets
- DcmByteString::containsExtendedCharacters():
  add check for ESCAPE characters (only allowed in code extensions)
- removed obsolete DcmCharString::containsExtendedCharacters()
---
 dcmdata/include/dcmtk/dcmdata/dcbytstr.h |  45 +++--
 dcmdata/include/dcmtk/dcmdata/dcchrstr.h |  41 +++-
 dcmdata/include/dcmtk/dcmdata/dcitem.h   |   9 +-
 dcmdata/include/dcmtk/dcmdata/dcsequen.h |   9 +-
 dcmdata/include/dcmtk/dcmdata/dcvrlt.h   |  15 +-
 dcmdata/include/dcmtk/dcmdata/dcvrst.h   |  14 +-
 dcmdata/include/dcmtk/dcmdata/dcvrut.h   |  14 +-
 dcmdata/libsrc/dcbytstr.cc               |  55 ++++--
 dcmdata/libsrc/dcchrstr.cc               | 238 ++++++++++++++++++++++-
 dcmdata/libsrc/dcvrlt.cc                 |   7 -
 dcmdata/libsrc/dcvrst.cc                 |   7 -
 dcmdata/libsrc/dcvrut.cc                 |   7 -
 dcmdata/tests/CMakeLists.txt             |   1 +
 dcmdata/tests/tchrstr.cc                 | 224 +++++++++++++++++++++
 dcmdata/tests/tchval.cc                  |  10 +-
 dcmdata/tests/tests.cc                   |   5 +
 16 files changed, 598 insertions(+), 103 deletions(-)
 create mode 100644 dcmdata/tests/tchrstr.cc

diff --git a/dcmdata/include/dcmtk/dcmdata/dcbytstr.h b/dcmdata/include/dcmtk/dcmdata/dcbytstr.h
index 17a6225820..add6270170 100644
--- a/dcmdata/include/dcmtk/dcmdata/dcbytstr.h
+++ b/dcmdata/include/dcmtk/dcmdata/dcbytstr.h
@@ -274,20 +274,19 @@ class DCMTK_DCMDATA_EXPORT DcmByteString: public DcmElement
      */
     virtual OFCondition verify(const OFBool autocorrect = OFFalse);
 
-    /** check if this element contains non-ASCII characters. Please note that this check
-     *  is pretty simple and only works for single-byte character sets that do include
-     *  the 7-bit ASCII codes, e.g. for the ISO 8859 family. In other words: All character
-     *  codes below 128 are considered to be ASCII codes and all others are considered to
-     *  be non-ASCII.
+    /** check if this element contains non-ASCII characters.
+     *  This works by checking for any byte values above 127, which works for any
+     *  single-byte code and for single-value multi-byte codes, and for ESC characters,
+     *  which will mean that a code extension is used.
      *  @param checkAllStrings if true, also check elements with string values not affected
      *    by SpecificCharacterSet (0008,0005). By default, only check PN, LO, LT, SH, ST,
-     *    UC and UT, i.e. none of the derived VR classes.
+     *    UC and UT.
      *  @return true if element contains non-ASCII characters, false otherwise
      */
     virtual OFBool containsExtendedCharacters(const OFBool checkAllStrings = OFFalse);
 
     /** check if this element is affected by SpecificCharacterSet
-     *  @return always returns false since none of the derived VR classes is affected by
+     *  @return returns false, overridden by derived VR classes that are affected by
      *    the SpecificCharacterSet (0008,0005) element
      */
     virtual OFBool isAffectedBySpecificCharacterSet() const;
@@ -379,6 +378,20 @@ class DCMTK_DCMDATA_EXPORT DcmByteString: public DcmElement
      */
     virtual OFCondition makeMachineByteString(const Uint32 length = 0);
 
+    /** check if the VR supports more than one value (overridden by VRs that do not).
+     *  @return always OFTrue in this class
+     */
+    virtual OFBool supportsMultiValue() const { return OFTrue; }
+
+    /** find the start index of the next value, i.e. of the byte following the next backslash.
+     *  @param str pointer to the string value to be searched
+     *  @param len the length of @a str
+     *  @param start the index in @a str at which the search for the backslash starts
+     *  @param charSet the value of Specific Character Set; not used
+     *  @return index of the next value relative to @a str (not to @a start), or OFString_npos
+     */
+    virtual size_t findNextValuePosition(const char* str, size_t len, size_t start, const OFString& charSet) const;
+
     /** convert currently stored string value to DICOM representation.
      *  It removes trailing spaces apart from a possibly required single padding
      *  character (in case of odd string length).
@@ -418,13 +431,23 @@ class DCMTK_DCMDATA_EXPORT DcmByteString: public DcmElement
      */
     void setNonSignificantChars(const OFString &characters) { nonSignificantChars = characters; }
 
+    /** set element value at a specific value position in the given character string,
+     *  considering the specific character set for finding the position, if given.
+     *  @param stringVal input character string (possibly multi-valued)
+     *  @param pos position (0..vm) where the value should be inserted
+     *  @param charSet the value of the Specific Character Set; may be empty
+     *  @return status, EC_Normal if successful, an error code otherwise
+     */
+    OFCondition putOFStringAtPosWithCharset(const OFString& stringVal,
+                                            const unsigned long pos,
+                                            const OFString& charSet);
+
     /* --- static helper functions --- */
 
     /** check if a given character string contains non-ASCII characters.
-     *  Please note that this check is pretty simple and only works for single-byte character
-     *  sets that do include the 7-bit ASCII codes, e.g. for the ISO 8859 family. In other
-     *  words: All character codes below 128 are considered to be ASCII codes and all others
-     *  are considered to be non-ASCII.
+     *  This works by checking for any byte values above 127, which works for any
+     *  single-byte code and for single-value multi-byte codes, and for ESC characters,
+     *  which will mean that a code extension is used.
      *  @param stringVal character string to be checked
      *  @param stringLen length of the string (number of characters without the trailing
      *    NULL byte)
diff --git a/dcmdata/include/dcmtk/dcmdata/dcchrstr.h b/dcmdata/include/dcmtk/dcmdata/dcchrstr.h
index ccc2542212..521c6d1988 100644
--- a/dcmdata/include/dcmtk/dcmdata/dcchrstr.h
+++ b/dcmdata/include/dcmtk/dcmdata/dcchrstr.h
@@ -107,17 +107,31 @@ class DCMTK_DCMDATA_EXPORT DcmCharString
      */
     virtual OFCondition verify(const OFBool autocorrect = OFFalse);
 
-    /** check if this element contains non-ASCII characters. Please note that this check
-     *  is pretty simple and only works for single-byte character sets that do include
-     *  the 7-bit ASCII codes, e.g. for the ISO 8859 family. In other words: All character
-     *  codes below 128 are considered to be ASCII codes and all others are considered to
-     *  be non-ASCII.
-     *  @param checkAllStrings not used in this class
-     *  @return true if element contains non-ASCII characters, false otherwise
+    /** get value multiplicity; backslash bytes inside multi-byte characters are not counted
+     *  @return number of string components (separated by a backslash delimiter)
      */
-    virtual OFBool containsExtendedCharacters(const OFBool checkAllStrings = OFFalse);
+    virtual unsigned long getVM();
 
-    /** check if this element is affected by SpecificCharacterSet
+    /** get a copy of a particular string component (aware of multi-byte character sets)
+     *  @param stringVal variable in which the result value is stored
+     *  @param pos index of the value in case of multi-valued elements (0..vm-1)
+     *  @param normalize not used since string normalization depends on value representation
+     *  @return status, EC_Normal if successful, EC_IllegalParameter if there is no value at @a pos
+     */
+    virtual OFCondition getOFString(OFString &stringVal,
+                                    const unsigned long pos,
+                                    OFBool normalize = OFTrue);
+
+
+    /** set element value at specific VM position, located using the Specific Character Set.
+     *  @param stringVal input character string (possibly multi-valued)
+     *  @param pos position (0..vm) where the value should be inserted
+     *  @return status, EC_Normal if successful, an error code otherwise
+     */
+    virtual OFCondition putOFStringAtPos(const OFString& stringVal,
+                                         const unsigned long pos = 0);
+
+    /** check if this element is affected by SpecificCharacterSet
      *  @return always returns true since all derived VR classes are affected by the
      *    SpecificCharacterSet (0008,0005) element
      */
@@ -169,6 +183,15 @@ class DCMTK_DCMDATA_EXPORT DcmCharString
      */
     virtual const OFString& getDelimiterChars() const;
 
+    /** find the start index of the next value, ignoring backslash bytes of multi-byte characters.
+     *  @param str pointer to the string value to be searched
+     *  @param len the length of @a str
+     *  @param start the index in @a str at which the search starts
+     *  @param charSet the value of Specific Character Set; if not set, single-byte encoding is assumed
+     *  @return index of the next value relative to @a str (not to @a start), or OFString_npos
+     */
+    virtual size_t findNextValuePosition(const char* str, size_t len, size_t start, const OFString& charSet) const;
+
 };
 
 
diff --git a/dcmdata/include/dcmtk/dcmdata/dcitem.h b/dcmdata/include/dcmtk/dcmdata/dcitem.h
index 84d2b6f3bf..e44b133723 100644
--- a/dcmdata/include/dcmtk/dcmdata/dcitem.h
+++ b/dcmdata/include/dcmtk/dcmdata/dcitem.h
@@ -317,11 +317,10 @@ class DCMTK_DCMDATA_EXPORT DcmItem
      */
     virtual OFBool containsUnknownVR() const;
 
-    /** check if this object contains non-ASCII characters at any nesting level. Please note
-     *  that this check is pretty simple and only works for single-byte character sets that
-     *  do include the 7-bit ASCII codes, e.g. for the ISO 8859 family. In other words: All
-     *  character codes below 128 are considered to be ASCII codes and all others are
-     *  considered to be non-ASCII.
+    /** check if this object contains non-ASCII characters.
+     *  This works by checking for any byte values above 127, which works for any
+     *  single-byte code and for single-value multi-byte codes, and for ESC characters,
+     *  which will mean that a code extension is used.
      *  @param checkAllStrings if true, also check elements with string values not affected
      *    by SpecificCharacterSet (0008,0005). By default, only check PN, LO, LT, SH, ST,
      *    UC and UT.
diff --git a/dcmdata/include/dcmtk/dcmdata/dcsequen.h b/dcmdata/include/dcmtk/dcmdata/dcsequen.h
index 0c41721fc0..c93d76c6f9 100644
--- a/dcmdata/include/dcmtk/dcmdata/dcsequen.h
+++ b/dcmdata/include/dcmtk/dcmdata/dcsequen.h
@@ -313,11 +313,10 @@ class DCMTK_DCMDATA_EXPORT DcmSequenceOfItems : public DcmElement
      */
     virtual OFBool containsUnknownVR() const;
 
-    /** check if this object contains non-ASCII characters at any nesting level. Please note
-     *  that this check is pretty simple and only works for single-byte character sets that
-     *  do include the 7-bit ASCII codes, e.g. for the ISO 8859 family. In other words: All
-     *  character codes below 128 are considered to be ASCII codes and all others are
-     *  considered to be non-ASCII.
+    /** check if this object contains non-ASCII characters.
+     *  This works by checking for any byte values above 127, which works for any
+     *  single-byte code and for single-value multi-byte codes, and for ESC characters,
+     *  which will mean that a code extension is used.
      *  @param checkAllStrings if true, also check elements with string values not affected
      *    by SpecificCharacterSet (0008,0005). By default, only check PN, LO, LT, SH, ST,
      *    UC and UT.
diff --git a/dcmdata/include/dcmtk/dcmdata/dcvrlt.h b/dcmdata/include/dcmtk/dcmdata/dcvrlt.h
index 846cada889..ca3e3686b6 100644
--- a/dcmdata/include/dcmtk/dcmdata/dcvrlt.h
+++ b/dcmdata/include/dcmtk/dcmdata/dcvrlt.h
@@ -114,13 +114,6 @@ class DCMTK_DCMDATA_EXPORT DcmLongText
     virtual OFCondition checkValue(const OFString &vm = "",
                                    const OFBool oldFormat = OFFalse);
 
-    /** get the value multiplicity.
-     *  Since the backslash "\" is not regarded as a separator the value
-     *  multiplicity is always 1.
-     *  @return value multiplicity of the currently stored value
-     */
-    virtual unsigned long getVM();
-
     /** get a copy of a particular string component
      *  @param stringVal variable in which the result value is stored
      *  @param pos index of the value in case of multi-valued elements (0..vm-1)
@@ -151,6 +144,14 @@ class DCMTK_DCMDATA_EXPORT DcmLongText
      */
     static OFCondition checkStringValue(const OFString &value,
                                         const OFString &charset = "");
+
+  protected:
+    /** check if the VR supports more than one value.
+     *  Since the backslash "\" is not regarded as a separator,
+     *  multiple values cannot be encoded.
+     *  @return always OFFalse
+     */
+    virtual OFBool supportsMultiValue() const { return OFFalse; }
 };
 
 
diff --git a/dcmdata/include/dcmtk/dcmdata/dcvrst.h b/dcmdata/include/dcmtk/dcmdata/dcvrst.h
index e012a46910..3b04499e59 100644
--- a/dcmdata/include/dcmtk/dcmdata/dcvrst.h
+++ b/dcmdata/include/dcmtk/dcmdata/dcvrst.h
@@ -115,13 +115,6 @@ class DCMTK_DCMDATA_EXPORT DcmShortText
     virtual OFCondition checkValue(const OFString &vm = "",
                                    const OFBool oldFormat = OFFalse);
 
-    /** get the value multiplicity.
-     *  Since the backslash "\" is not regarded as a separator the value
-     *  multiplicity is always 1.
-     *  @return value multiplicity of the currently stored value
-     */
-    virtual unsigned long getVM();
-
     /** get a copy of a particular string component
      *  @param stringVal variable in which the result value is stored
      *  @param pos index of the value in case of multi-valued elements (0..vm-1)
@@ -152,6 +145,13 @@ class DCMTK_DCMDATA_EXPORT DcmShortText
      */
     static OFCondition checkStringValue(const OFString &value,
                                         const OFString &charset = "");
+  protected:
+    /** check if the VR supports more than one value.
+     *  Since the backslash "\" is not regarded as a separator,
+     *  multiple values cannot be encoded.
+     *  @return always OFFalse
+     */
+    virtual OFBool supportsMultiValue() const { return OFFalse; }
 };
 
 
diff --git a/dcmdata/include/dcmtk/dcmdata/dcvrut.h b/dcmdata/include/dcmtk/dcmdata/dcvrut.h
index 3b8121b9cd..17341197ec 100644
--- a/dcmdata/include/dcmtk/dcmdata/dcvrut.h
+++ b/dcmdata/include/dcmtk/dcmdata/dcvrut.h
@@ -117,13 +117,6 @@ class DCMTK_DCMDATA_EXPORT DcmUnlimitedText
     virtual OFCondition checkValue(const OFString &vm = "",
                                    const OFBool oldFormat = OFFalse);
 
-    /** get the value multiplicity.
-     *  Since the backslash "\" is not regarded as a separator the value
-     *  multiplicity is always 1.
-     *  @return value multiplicity of the currently stored value
-     */
-    virtual unsigned long getVM();
-
     /** get a copy of a particular string component
      *  @param stringVal variable in which the result value is stored
      *  @param pos index of the value in case of multi-valued elements (0..vm-1)
@@ -154,6 +147,13 @@ class DCMTK_DCMDATA_EXPORT DcmUnlimitedText
      */
     static OFCondition checkStringValue(const OFString &value,
                                         const OFString &charset = "");
+  protected:
+    /** check if the VR supports more than one value.
+     *  Since the backslash "\" is not regarded as a separator,
+     *  multiple values cannot be encoded.
+     *  @return always OFFalse
+     */
+    virtual OFBool supportsMultiValue() const { return OFFalse; }
 };
 
 
diff --git a/dcmdata/libsrc/dcbytstr.cc b/dcmdata/libsrc/dcbytstr.cc
index 73f77375fd..92385658f7 100644
--- a/dcmdata/libsrc/dcbytstr.cc
+++ b/dcmdata/libsrc/dcbytstr.cc
@@ -467,9 +467,8 @@ OFCondition DcmByteString::putString(const char *stringVal,
     return errorFlag;
 }
 
-
-OFCondition DcmByteString::putOFStringAtPos(const OFString& stringVal,
-                                            const unsigned long pos)
+OFCondition DcmByteString::putOFStringAtPosWithCharset(const OFString& stringVal, const unsigned long pos,
+    const OFString& charSet)
 {
     OFCondition result;
     // Get old value
@@ -505,8 +504,8 @@ OFCondition DcmByteString::putOFStringAtPos(const OFString& stringVal,
             // First value is set: Replace old value with new value
             else
             {
-                rightPos = str.find_first_of('\\', 0);
-                str = str.replace(0, rightPos, stringVal);
+                rightPos = findNextValuePosition(str.c_str(), str.length(), 0, charSet);
+                str = str.replace(0, rightPos - 1, stringVal);
             }
             return putOFStringArray(str);
         }
@@ -514,29 +513,25 @@ OFCondition DcmByteString::putOFStringAtPos(const OFString& stringVal,
         // 3rd case: New value should be inserted somewhere in the middle
         size_t leftPos = 0;
         size_t vmPos = 0;
+        size_t strLen = str.length();
         // First, find the correct position, and then insert / replace new value
         do
         {
             // Step from value to value by looking for delimiters.
-            // Special handling first search (start looking at position 0 instead of 1)
-            if (vmPos == 0) leftPos = str.find('\\', 0);
-            else leftPos = str.find('\\', leftPos + 1 );
-            // leftPos = str.find('\\', leftPos == 0 ? 0 : leftPos +1);
+            leftPos = findNextValuePosition(str.c_str(), strLen, leftPos, charSet);
             if (leftPos != OFString_npos)
-            {
                 vmPos++;
-            }
         }
         while ( (leftPos != OFString_npos) && (vmPos != pos) );
-        rightPos = str.find_first_of('\\', leftPos+1);
-        if (rightPos == OFString_npos) rightPos = str.length();
+        rightPos = findNextValuePosition(str.c_str(), strLen, leftPos, charSet);
+        if (rightPos == OFString_npos) rightPos = strLen + 1;
 
         // If we do not have an old value of size 1 or we have an empty value
         if (rightPos - leftPos == 1)
         {
             // Empty value
             if (str.at(leftPos) == '\\')
-                str = str.insert(rightPos, stringVal);
+                str = str.insert(leftPos, stringVal);
             // Old value (length 1)
             else
                 str = str.replace(leftPos, 1, stringVal);
@@ -544,7 +539,7 @@ OFCondition DcmByteString::putOFStringAtPos(const OFString& stringVal,
         // Otherwise replace existing old value (length > 1)
         else
         {
-            str = str.replace(leftPos+1, rightPos - leftPos - 1, stringVal);
+            str = str.replace(leftPos, rightPos - leftPos - 1, stringVal);
         }
         // Finally re-insert all values include new value
         result = putOFStringArray( str );
@@ -553,6 +548,28 @@ OFCondition DcmByteString::putOFStringAtPos(const OFString& stringVal,
 }
 
 
+
+OFCondition DcmByteString::putOFStringAtPos(const OFString& stringVal, const unsigned long pos)
+{
+    // no character set is passed on, i.e. the value delimiters are searched byte by byte
+    return putOFStringAtPosWithCharset(stringVal, pos, "");
+}
+
+
+// ********************************
+
+
+size_t DcmByteString::findNextValuePosition(const char* str, size_t len, size_t start, const OFString& /*charSet*/) const
+{
+    // index-based scan: no pointer is formed if start lies behind the end (e.g. OFString_npos)
+    for (size_t i = start; i < len; ++i)
+    {
+        if (str[i] == '\\')
+            return i + 1; // index of the byte following the delimiter, relative to str
+    }
+    return OFString_npos;
+}
+
 // ********************************
 
 
@@ -770,7 +787,7 @@ OFBool DcmByteString::containsExtendedCharacters(const OFBool checkAllStrings)
     OFBool result = OFFalse;
     /* only check if parameter is true since derived VRs are not affected
        by the attribute SpecificCharacterSet (0008,0005) */
-    if (checkAllStrings)
+    if (checkAllStrings || isAffectedBySpecificCharacterSet())
     {
         char *str = NULL;
         Uint32 len = 0;
@@ -876,10 +893,10 @@ OFBool DcmByteString::containsExtendedCharacters(const char *stringVal,
 {
     if (stringVal != NULL)
     {
-        for (size_t i = stringLen; i != 0; --i)
+        for (size_t i = stringLen; i != 0; --i, ++stringVal)
         {
-            /* check for 8 bit characters */
-            if (OFstatic_cast(unsigned char, *stringVal++) > 127)
+            /* check for 8 bit and Escape characters */
+            if ((*stringVal & 0x80) != 0 || (*stringVal == 0x1b))
                 return OFTrue;
         }
     }
diff --git a/dcmdata/libsrc/dcchrstr.cc b/dcmdata/libsrc/dcchrstr.cc
index a10d49d99e..0c50961903 100644
--- a/dcmdata/libsrc/dcchrstr.cc
+++ b/dcmdata/libsrc/dcchrstr.cc
@@ -45,6 +45,85 @@
 
 #include "dcmtk/dcmdata/dcchrstr.h"
 
+static unsigned long getMaximumNumberOfValues(const char *s, Uint32 len)
+{
+    // Counts every backslash byte. Such a byte may also be part of a multi-byte character,
+    // so the result is an upper bound of the real VM (and never less than 1). The value is
+    // taken as raw pointer: an OFString parameter would copy it and cut it at a NULL byte.
+    unsigned long vm = 1;
+    for (size_t i = 0; i < len; i++)
+        if (s[i] == '\\')
+            ++vm;
+    return vm;
+}
+
+static OFCondition getOFStringAtIndex(
+    OFString& stringVal, const unsigned long pos, const char *str, Uint32 len)
+{
+    // splits at every backslash byte, which is only correct for single-byte encodings
+    const char *const end = str + len;
+    const char *valueStart = str;
+    unsigned long valueNo = 1; // 1-based number of the value that is currently scanned
+    for (const char *cur = str; cur != end; ++cur)
+    {
+        if (*cur != '\\')
+            continue;
+        if (pos + 1 == valueNo) // this delimiter terminates the requested value
+        {
+            stringVal.assign(valueStart, cur);
+            return EC_Normal;
+        }
+        if (pos == valueNo)
+            valueStart = cur + 1; // the requested value starts behind this delimiter
+        ++valueNo;
+    }
+    if (pos + 1 == valueNo)
+    {
+        // the requested value is the last one and extends to the end of the string
+        stringVal.assign(valueStart, end);
+        return EC_Normal;
+    }
+    if (pos != 0)
+        return EC_IllegalParameter;
+    stringVal.clear();
+    return EC_Normal;
+}
+
+
+// ********************************
+
+// helper functions dealing with specific character sets
+
+static OFBool isMultiValuedCharacterSet(const OFString& charset)
+{
+    return !(charset.find('\\') == OFString_npos); // a backslash separates several values
+}
+
+static void skipMultiByteEscapeSequence(const char *&p, size_t &i, size_t len)
+{
+    // p has to point to byte i; "i + 2" cannot wrap around for len < 2 like "len - 2" does
+    if ((*p != 0x1b) || (i + 2 >= len))
+        return;
+
+    // found an escape sequence, check if it is for a multi-byte encoding; the sequences
+    // for ISO 2022 IR 87, ISO 2022 IR 159, ISO 2022 IR 149 and ISO 2022 IR 58 start with "$"
+    ++i;
+    bool isMultiByte = *++p == '$';
+    if (!isMultiByte && *p == '-')
+    {
+        ++i;
+        isMultiByte = *++p == 'T'; // ISO 2022 IR 166
+    }
+    if (!isMultiByte)
+        return;
+    // NOTE(review): IR 149, IR 58 and IR 166 presumably use G1 (high-bit) bytes only - TODO confirm
+    // skip to the byte behind the next ESC, or stop three bytes before the end of the value;
+    // i and p are only advanced together, so p keeps pointing to byte i in both cases
+    while ((i + 3 < len) && (++i, *p++ != 0x1b)) {}
+}
+
+// ********************************
+
 
 DcmCharString::DcmCharString(const DcmTag &tag, const Uint32 len)
   : DcmByteString(tag, len)
@@ -140,24 +219,136 @@ OFCondition DcmCharString::verify(const OFBool autocorrect)
 }
 
 
-OFBool DcmCharString::containsExtendedCharacters(const OFBool /*checkAllStrings*/)
+// ********************************
+
+
+unsigned long DcmCharString::getVM()
 {
-    OFBool result = OFFalse;
+    // the vast majority of values have VM 0 or 1, so optimize for these
     char *str = NULL;
     Uint32 len = 0;
-    /* determine length in order to support possibly embedded NULL bytes */
-    if (getString(str, len).good())
-        result = DcmByteString::containsExtendedCharacters(str, len);
-    return result;
+    OFCondition result = getString(str, len);
+    if (!result.good() || (str == NULL) || (len == 0))
+        return 0;
+
+    if (!supportsMultiValue())
+        return 1;
+
+    unsigned long vm = getMaximumNumberOfValues(str, len);
+    if (vm == 1 || !containsExtendedCharacters())
+        return vm;
+
+    // We have a string containing extended characters and possibly backslashes -
+    // now we have to get the Specific Character Set to filter out bytes with the
+    // value for backslash (0x5C) that are part of a multi-byte character.
+    OFString charset;
+    result = getSpecificCharacterSet(charset);
+    if (!result.good() || charset.empty())
+        return vm;
+
+    if (isMultiValuedCharacterSet(charset) ||
+        DcmSpecificCharacterSet::isNonASCIIConformMultiByteSingleValueCharacterSet(charset))
+    {
+        vm = 1;
+        size_t startPos = 0;
+        size_t valuePos;
+        while ((valuePos = findNextValuePosition(str, len, startPos, charset)) != OFString_npos)
+        {
+            ++vm;
+            startPos = valuePos; // already an index into str, must not be added to the old start
+        }
+    }
+
+    return vm;
+}
+
+
+// ********************************
+
+
+OFCondition DcmCharString::getOFString(OFString& stringVal, const unsigned long pos, OFBool /*normalize*/)
+{
+    char *str = NULL;
+    Uint32 len = 0;
+    OFCondition result = getString(str, len);
+    if (result.bad())
+        return result;
+
+    if ((str == NULL) || (len == 0))
+    {
+        if (pos > 0)
+            return EC_IllegalParameter;
+        stringVal.clear();
+        return EC_Normal;
+    }
+    // fast path: no value delimiter supported, or not a single backslash byte in the value
+    if (!supportsMultiValue() || getMaximumNumberOfValues(str, len) == 1)
+    {
+        if (pos > 0)
+            return EC_IllegalParameter;
+        stringVal.assign(str, str + len);
+        return EC_Normal;
+    }
+
+    // only check for multi-byte character sets if the value contains any non-ASCII characters
+    // or Escape sequences
+    if (containsExtendedCharacters())
+    {
+        // We have a string containing extended characters and possibly backslashes -
+        // now we have to get the Specific Character Set to filter out bytes with the
+        // value for backslash (0x5C) that are part of a multi-byte character.
+        OFString charset;
+        result = getSpecificCharacterSet(charset);
+        if (result.good() && !charset.empty() &&
+            (isMultiValuedCharacterSet(charset) ||
+                DcmSpecificCharacterSet::isNonASCIIConformMultiByteSingleValueCharacterSet(charset)))
+        {
+            // step from value to value until the requested one is reached
+            unsigned long index = 0;
+            size_t valuePos = 0;
+            while (index < pos &&
+                (valuePos = findNextValuePosition(str, len, valuePos, charset)) != OFString_npos)
+                ++index;
+            if (valuePos == OFString_npos)
+                return EC_IllegalParameter;
+            if (valuePos == len)
+                stringVal.clear();
+            else
+            {
+                size_t valueEnd = findNextValuePosition(str, len, valuePos, charset);
+                if (valueEnd == OFString_npos)
+                    valueEnd = len + 1;
+                // account for the backslash before the end pointer
+                stringVal.assign(str + valuePos, str + valueEnd - 1);
+            }
+            return EC_Normal;
+        }
+    }
+    // single-byte, single-value encoding, or value without extended characters
+    return getOFStringAtIndex(stringVal, pos, str, len);
+}
+
+OFCondition DcmCharString::putOFStringAtPos(const OFString& stringVal, const unsigned long pos)
+{
+    // an empty character set (also used if none can be determined) selects the byte-wise search
+    OFString specificCharSet;
+    if (getSpecificCharacterSet(specificCharSet).bad())
+        specificCharSet.clear();
+    return putOFStringAtPosWithCharset(stringVal, pos, specificCharSet);
 }
 
 
+// ********************************
+
+
 OFBool DcmCharString::isAffectedBySpecificCharacterSet() const
 {
     return OFTrue;
 }
 
 
+// ********************************
+
+
 OFCondition DcmCharString::convertCharacterSet(DcmSpecificCharacterSet &converter)
 {
     char *str = NULL;
@@ -274,6 +465,41 @@ const OFString& DcmCharString::getDelimiterChars() const
     return DcmVR(ident()).getDelimiterChars();
 }
 
+size_t DcmCharString::findNextValuePosition(const char* str, size_t len, size_t start, const OFString& charSet) const
+{
+    if (charSet.empty())
+        return DcmByteString::findNextValuePosition(str, len, start, charSet);
+    // nothing left to search; also avoids forming a pointer outside of the value (OFString_npos)
+    if (start >= len)
+        return OFString_npos;
+    const char *p = str + start;
+    if (DcmSpecificCharacterSet::isNonASCIIConformMultiByteSingleValueCharacterSet(charSet))
+    {
+        // special handling to find real backslashes in Chinese multi-byte encodings;
+        // the first byte for 2-byte characters, and the first and third bytes of 4-byte
+        // characters are always > 0x80, so the byte behind such a byte is never a delimiter
+        for (size_t i = start; i < len; ++i, ++p)
+        {
+            if (*p == '\\')
+                return i + 1;
+            if ((*p & 0x80) != 0)
+            {
+                // 2-byte character or one half of a 4-byte character - skip the next byte
+                ++p;
+                ++i;
+            }
+        }
+        return OFString_npos;
+    }
+    // other character sets: parts introduced by a multi-byte escape sequence are skipped
+    for (size_t i = start; i < len; ++i, ++p)
+    {
+        if (*p == '\\')
+            return i + 1;
+        skipMultiByteEscapeSequence(p, i, len);
+    }
+    return OFString_npos;
+}
 
 OFBool DcmCharString::isUniversalMatch(const OFBool normalize,
                                        const OFBool enableWildCardMatching)
diff --git a/dcmdata/libsrc/dcvrlt.cc b/dcmdata/libsrc/dcvrlt.cc
index 140703bcd6..3f91ad03e5 100644
--- a/dcmdata/libsrc/dcvrlt.cc
+++ b/dcmdata/libsrc/dcvrlt.cc
@@ -126,13 +126,6 @@ OFCondition DcmLongText::checkValue(const OFString & /*vm*/,
 }
 
 
-unsigned long DcmLongText::getVM()
-{
-    /* value multiplicity is 1 for non-empty string, 0 otherwise */
-    return (getRealLength() > 0) ? 1 : 0;
-}
-
-
 // ********************************
 
 
diff --git a/dcmdata/libsrc/dcvrst.cc b/dcmdata/libsrc/dcvrst.cc
index 7d17f5876d..4b4974eb81 100644
--- a/dcmdata/libsrc/dcvrst.cc
+++ b/dcmdata/libsrc/dcvrst.cc
@@ -126,13 +126,6 @@ OFCondition DcmShortText::checkValue(const OFString & /*vm*/,
 }
 
 
-unsigned long DcmShortText::getVM()
-{
-    /* value multiplicity is 1 for non-empty string, 0 otherwise */
-    return (getRealLength() > 0) ? 1 : 0;
-}
-
-
 // ********************************
 
 
diff --git a/dcmdata/libsrc/dcvrut.cc b/dcmdata/libsrc/dcvrut.cc
index 0f0a6d048d..0c8c18075e 100644
--- a/dcmdata/libsrc/dcvrut.cc
+++ b/dcmdata/libsrc/dcvrut.cc
@@ -127,13 +127,6 @@ OFCondition DcmUnlimitedText::checkValue(const OFString & /*vm*/,
 }
 
 
-unsigned long DcmUnlimitedText::getVM()
-{
-    /* value multiplicity is 1 for non-empty string, 0 otherwise */
-    return (getRealLength() > 0) ? 1 : 0;
-}
-
-
 // ********************************
 
 
diff --git a/dcmdata/tests/CMakeLists.txt b/dcmdata/tests/CMakeLists.txt
index 98ea562ea7..a1dae78fe6 100644
--- a/dcmdata/tests/CMakeLists.txt
+++ b/dcmdata/tests/CMakeLists.txt
@@ -1,6 +1,7 @@
 # declare executables
 DCMTK_ADD_TEST_EXECUTABLE(dcmdata_tests
   tbytestr.cc
+  tchrstr.cc
   tchval.cc
   tdict.cc
   telemlen.cc
diff --git a/dcmdata/tests/tchrstr.cc b/dcmdata/tests/tchrstr.cc
new file mode 100644
index 0000000000..14e09d7623
--- /dev/null
+++ b/dcmdata/tests/tchrstr.cc
@@ -0,0 +1,224 @@
+/*
+ *
+ *  Copyright (C) 2025, OFFIS e.V.
+ *  All rights reserved.  See COPYRIGHT file for details.
+ *
+ *  This software and supporting documentation were developed by
+ *
+ *    OFFIS e.V.
+ *    R&D Division Health
+ *    Escherweg 2
+ *    D-26121 Oldenburg, Germany
+ *
+ *
+ *  Module:  dcmdata
+ *
+ *  Purpose: test program for DcmCharString and derived classes
+ *
+ */
+
+
+#include "dcmtk/config/osconfig.h"    /* make sure OS specific configuration is included first */
+
+#include "dcmtk/ofstd/oftest.h"
+
+#include "dcmtk/dcmdata/dcchrstr.h"
+#include "dcmtk/dcmdata/dcdatset.h"
+#include "dcmtk/dcmdata/dcdeftag.h"
+#include "dcmtk/dcmdata/dcvrlo.h"
+#include "dcmtk/dcmdata/dcvrlt.h"
+#include "dcmtk/dcmdata/dcvrpn.h"
+#include "dcmtk/dcmdata/dcvrsh.h"
+#include "dcmtk/dcmdata/dcvrst.h"
+#include "dcmtk/dcmdata/dcvruc.h"
+#include "dcmtk/dcmdata/dcvrut.h"
+
+
+OFTEST(dcmdata_charString_derived_getVM)
+{
+    // getVM() counts backslash-separated values for SH, LO, UC and PN, but
+    // yields 0 or 1 for LT, ST and UT, where a backslash is not a delimiter
+    DcmDataset dataset;
+    DcmLongString* longString = new DcmLongString(DCM_StudyDescription, 0);
+    OFCHECK(dataset.insert(longString).good());  // every element needs its own tag
+    OFCHECK_EQUAL(longString->getVM(), 0);
+    longString->putString("One\\Two\\Three");
+    OFCHECK_EQUAL(longString->getVM(), 3);
+
+    DcmShortString* shortString = new DcmShortString(DCM_AccessionNumber, 0);
+    OFCHECK(dataset.insert(shortString).good());
+    OFCHECK_EQUAL(shortString->getVM(), 0);
+    shortString->putString("One\\Two\\Three");
+    OFCHECK_EQUAL(shortString->getVM(), 3);
+
+    DcmUnlimitedCharacters* unlimitedChars = new DcmUnlimitedCharacters(DCM_GeneticModificationsDescription, 0);
+    OFCHECK(dataset.insert(unlimitedChars).good());
+    OFCHECK_EQUAL(unlimitedChars->getVM(), 0);
+    unlimitedChars->putString("One\\Two\\Three");
+    OFCHECK_EQUAL(unlimitedChars->getVM(), 3);
+
+    DcmPersonName* personName = new DcmPersonName(DCM_PatientName, 0);
+    OFCHECK(dataset.insert(personName).good());
+    OFCHECK_EQUAL(personName->getVM(), 0);
+    personName->putString("One\\Two\\Three");
+    OFCHECK_EQUAL(personName->getVM(), 3);
+
+    DcmLongText* longText = new DcmLongText(DCM_InventoryPurpose, 0);
+    OFCHECK(dataset.insert(longText).good());
+    OFCHECK_EQUAL(longText->getVM(), 0);
+    longText->putString("One\\Two\\Three");
+    OFCHECK_EQUAL(longText->getVM(), 1);
+
+    DcmShortText* shortText = new DcmShortText(DCM_InstitutionAddress, 0);  // must differ from the LT tag above
+    OFCHECK(dataset.insert(shortText).good());
+    OFCHECK_EQUAL(shortText->getVM(), 0);
+    shortText->putString("One\\Two\\Three");
+    OFCHECK_EQUAL(shortText->getVM(), 1);
+
+    DcmUnlimitedText* unlimitedText = new DcmUnlimitedText(DCM_StrainAdditionalInformation, 0);
+    OFCHECK(dataset.insert(unlimitedText).good());
+    OFCHECK_EQUAL(unlimitedText->getVM(), 0);
+    unlimitedText->putString("One\\Two\\Three");
+    OFCHECK_EQUAL(unlimitedText->getVM(), 1);
+}
+
+OFTEST(dcmdata_charString_getVM_multibyte) {
+    DcmDataset dataset;
+    DcmLongString* studyDescr = new DcmLongString(DCM_StudyDescription, 0);
+    dataset.insert(studyDescr);  // getVM() below is expected to follow the dataset's Specific Character Set
+
+    // single-byte/single-value encoding (Latin1)
+    dataset.putAndInsertString(DCM_SpecificCharacterSet, "ISO_IR 100");
+    // \x5c is the backslash character
+    studyDescr->putString("Smith\\\x83\x5c");
+    OFCHECK_EQUAL(studyDescr->getVM(), 3);  // "Smith", "\x83" and an empty trailing value
+
+    // multi-byte/single-value encoding
+    dataset.putAndInsertString(DCM_SpecificCharacterSet, "GB18030");
+    // \x5c is now the second byte of a 2-byte Chinese character, not a backslash
+    OFCHECK_EQUAL(studyDescr->getVM(), 2);  // same stored bytes, only the charset changed
+
+    // single-byte/multi-value encoding (Latin1 with a Greek code extension)
+    dataset.putAndInsertString(DCM_SpecificCharacterSet, "ISO_IR 100\\ISO_IR 126");
+    studyDescr->putString("Dionysios=\x1b\x2d\x46\xc4\xe9\xef\xed\xf5\xf3\xe9\xef\xf2");
+    OFCHECK_EQUAL(studyDescr->getVM(), 1);
+    // backslash inside a single-byte code extension is still a delimiter
+    studyDescr->putString("Dionysios=\x1b\x2d\x46\xc4\xe9\xef\\\xed\xf5\xf3\xe9\xef\xf2");
+    OFCHECK_EQUAL(studyDescr->getVM(), 2);
+
+    // code extension with multi-byte encoding
+    dataset.putAndInsertString(DCM_SpecificCharacterSet, "ISO 2022 IR 13\\ISO 2022 IR 87");
+    studyDescr->putString("One\\Two\\Three");
+    OFCHECK_EQUAL(studyDescr->getVM(), 3);
+    // delimiter-like byte in a multi-byte string: ESC $ B, 2-byte char "K\", ESC ( J
+    studyDescr->putString("Smith=\x1b$BK\\\x1b(J");
+    OFCHECK_EQUAL(studyDescr->getVM(), 1);
+}
+
+OFTEST(dcmdata_charString_getOFString) {
+    DcmDataset dataset;
+    DcmLongString* studyDescr = new DcmLongString(DCM_StudyDescription, 0);
+    dataset.insert(studyDescr);  // getOFString() below is expected to follow the dataset's Specific Character Set
+
+    // single-byte/single-value encoding (Latin1)
+    dataset.putAndInsertString(DCM_SpecificCharacterSet, "ISO_IR 100");
+    // \x5c is the backslash character, so the string holds three values
+    OFString stringValue;
+    studyDescr->putString("John\\\x83\x5cSmith");
+    OFCHECK(studyDescr->getOFString(stringValue, 0).good());
+    OFCHECK_EQUAL(stringValue, "John");
+    OFCHECK(studyDescr->getOFString(stringValue, 1).good());
+    OFCHECK_EQUAL(stringValue, "\x83");
+    OFCHECK(studyDescr->getOFString(stringValue, 2).good());
+    OFCHECK_EQUAL(stringValue, "Smith");
+
+
+    // multi-byte/single-value encoding
+    dataset.putAndInsertString(DCM_SpecificCharacterSet, "GB18030");
+    // \x5c is now the second byte of a 2-byte Chinese character, not a backslash
+    OFCHECK(studyDescr->getOFString(stringValue, 0).good());
+    OFCHECK_EQUAL(stringValue, "John");
+    OFCHECK(studyDescr->getOFString(stringValue, 1).good());
+    OFCHECK_EQUAL(stringValue, "\x83\x5cSmith");
+
+    // code extension with multi-byte encoding
+    dataset.putAndInsertString(DCM_SpecificCharacterSet, "ISO 2022 IR 13\\ISO 2022 IR 87");
+    // delimiter-like byte in a multi-byte string: ESC $ B, 2-byte char "K\", ESC ( J
+    studyDescr->putString("Smith=\x1b$BK\\\x1b(J");
+    OFCHECK(studyDescr->getOFString(stringValue, 0).good());
+    OFCHECK_EQUAL(stringValue, "Smith=\x1b$BK\\\x1b(J");
+}
+
+OFTEST(dcmdata_charString_getOFStringArray) {
+    DcmDataset dataset;
+    DcmLongString* studyDescr = new DcmLongString(DCM_StudyDescription, 0);
+    dataset.insert(studyDescr);  // element is read under the dataset's Specific Character Set
+
+    // single-byte/single-value encoding (Latin1)
+    dataset.putAndInsertString(DCM_SpecificCharacterSet, "ISO_IR 100");
+    // \x5c is the backslash character
+    OFString stringValue;
+    studyDescr->putString("John\\\x83\x5cSmith");
+    OFCHECK(studyDescr->getOFStringArray(stringValue).good());
+    OFCHECK_EQUAL(stringValue, "John\\\x83\x5cSmith");
+
+    // multi-byte/single-value encoding
+    dataset.putAndInsertString(DCM_SpecificCharacterSet, "GB18030");
+    // \x5c is now the second byte of a 2-byte Chinese character, not a backslash;
+    // the complete value string is expected to come back unchanged regardless
+    OFCHECK(studyDescr->getOFStringArray(stringValue).good());
+    OFCHECK_EQUAL(stringValue, "John\\\x83\x5cSmith");
+
+    // code extension with multi-byte encoding
+    dataset.putAndInsertString(DCM_SpecificCharacterSet, "ISO 2022 IR 13\\ISO 2022 IR 87");
+    // delimiter-like byte in a multi-byte string: ESC $ B, 2-byte char "K\", ESC ( J
+    studyDescr->putString("Smith=\x1b$BK\\\x1b(J");
+    OFCHECK(studyDescr->getOFStringArray(stringValue).good());
+    OFCHECK_EQUAL(stringValue, "Smith=\x1b$BK\\\x1b(J");
+}
+
+OFTEST(dcmdata_charString_putOFStringAtPos) {
+    DcmDataset dataset;
+    DcmLongString* studyDescr = new DcmLongString(DCM_StudyDescription, 0);
+    dataset.insert(studyDescr);  // value positions below are expected to follow the dataset's Specific Character Set
+
+    // single-byte/single-value encoding (Latin1)
+    dataset.putAndInsertString(DCM_SpecificCharacterSet, "ISO_IR 100");
+    // \x5c is the backslash character, so the string holds three values
+    OFString stringValue;
+    studyDescr->putString("John\\\x83\x5cSmith");
+    OFCHECK(studyDescr->putOFStringAtPos("James", 0).good());
+    OFCHECK(studyDescr->getOFStringArray(stringValue).good());
+    OFCHECK_EQUAL(stringValue, "James\\\x83\x5cSmith");
+    OFCHECK(studyDescr->putOFStringAtPos("H", 1).good());
+    OFCHECK(studyDescr->getOFStringArray(stringValue).good());
+    OFCHECK_EQUAL(stringValue, "James\\H\\Smith");
+
+
+    // multi-byte/single-value encoding
+    dataset.putAndInsertString(DCM_SpecificCharacterSet, "GB18030");
+    // \x5c is now the second byte of a 2-byte Chinese character, not a backslash
+    studyDescr->putString("John\\\x83\x5cSmith");
+    OFCHECK(studyDescr->putOFStringAtPos("James", 0).good());
+    OFCHECK(studyDescr->getOFStringArray(stringValue).good());
+    OFCHECK_EQUAL(stringValue, "James\\\x83\x5cSmith");
+    OFCHECK(studyDescr->putOFStringAtPos("H", 1).good());  // replaces the whole second value
+    OFCHECK(studyDescr->getOFStringArray(stringValue).good());
+    OFCHECK_EQUAL(stringValue, "James\\H");
+    studyDescr->putString("John\\\x83\x5cSmith");
+    OFCHECK(studyDescr->putOFStringAtPos("Baker", 2).good());  // only two values exist, so a third is appended
+    OFCHECK(studyDescr->getOFStringArray(stringValue).good());
+    OFCHECK_EQUAL(stringValue, "John\\\x83\x5cSmith\\Baker");
+
+    // code extension with multi-byte encoding
+    dataset.putAndInsertString(DCM_SpecificCharacterSet, "ISO 2022 IR 13\\ISO 2022 IR 87");
+    // delimiter-like byte in a multi-byte string: ESC $ B, 2-byte char "K\", ESC ( J
+    studyDescr->putString("Smith=\x1b$BK\\\x1b(J");
+    OFCHECK(studyDescr->putOFStringAtPos("Doe", 0).good());
+    OFCHECK(studyDescr->getOFStringArray(stringValue).good());
+    OFCHECK_EQUAL(stringValue, "Doe");
+    studyDescr->putString("Smith=\x1b$BK\\\x1b(J");
+    OFCHECK(studyDescr->putOFStringAtPos("Jane", 1).good());
+    OFCHECK(studyDescr->getOFStringArray(stringValue).good());
+    OFCHECK_EQUAL(stringValue, "Smith=\x1b$BK\\\x1b(J\\Jane");
+}
diff --git a/dcmdata/tests/tchval.cc b/dcmdata/tests/tchval.cc
index 5e07dd6e44..459d5b662a 100644
--- a/dcmdata/tests/tchval.cc
+++ b/dcmdata/tests/tchval.cc
@@ -165,9 +165,8 @@ OFTEST(dcmdata_checkStringValue)
 // maximum length cannot be checked if given in characters (and not bytes)
 //  CHECK_BAD ( "LO-07", DcmLongString::checkStringValueu("OFFIS e.V., Escherweg 2, 26121 Oldenburg, Germany, http://www.offis.de/", "1") )
   CHECK_GOOD( "LO-08", DcmLongString::checkStringValue("\\ _2_ \\ _3_ \\ _4_ \\ _5_ \\", "6") )
-  // actually, the following test should fail
-  CHECK_GOOD( "LO-09", DcmLongString::checkStringValue("ESC only allowed for ISO 2022 character set control sequences: \033", "1") )
-  CHECK_BAD ( "LO-10", DcmLongString::checkStringValue("also not allowed: \r\014", "1") )
+  CHECK_BAD( "LO-09", DcmLongString::checkStringValue("ESC only allowed for charset extension \033", "1") )
+  CHECK_BAD ( "LO-10", DcmLongString::checkStringValue("not allowed: \r\014", "1") )
 
   /* test "Long Text" */
   CHECK_GOOD( "LT-01", DcmLongText::checkStringValue(" Hello \\ 12345 \\ \344\366\374\337 ", "ISO_IR 100") )
@@ -215,8 +214,7 @@ OFTEST(dcmdata_checkStringValue)
   CHECK_GOOD( "SH-08", DcmShortString::checkStringValue("\\ _2_ \\ _3_ \\ _4_ \\ _5_ \\", "6") )
   CHECK_BAD ( "SH-09", DcmShortString::checkStringValue(" ", "2") )
   CHECK_GOOD( "SH-10", DcmShortString::checkStringValue("", "2") )
-  // actually, the following test should fail
-  CHECK_GOOD( "SH-11", DcmShortString::checkStringValue("not allowed: \033", "1") )
+  CHECK_BAD ( "SH-11", DcmShortString::checkStringValue("not allowed: \033", "1") )
   CHECK_BAD ( "SH-12", DcmShortString::checkStringValue("not allowed: \n\r", "1") )
   CHECK_BAD ( "SH-13", DcmShortString::checkStringValue("not allowed: \010\014", "1") )
 
@@ -248,7 +246,7 @@ OFTEST(dcmdata_checkStringValue)
   CHECK_GOOD( "UC-01", DcmUnlimitedCharacters::checkStringValue("ABC", "1") )
   CHECK_GOOD( "UC-02", DcmUnlimitedCharacters::checkStringValue("ABC\\123", "2") )
   CHECK_GOOD( "UC-03", DcmUnlimitedCharacters::checkStringValue(" J\366rg Riesmeier ", "1", "ISO_IR 100") )
-  CHECK_GOOD( "UC-04", DcmUnlimitedCharacters::checkStringValue("ESC\033aping", "1") )
+  CHECK_BAD( "UC-04", DcmUnlimitedCharacters::checkStringValue("ESC only allowed for charset extension \033", "1") )
   CHECK_BAD ( "UC-05", DcmUnlimitedCharacters::checkStringValue("not allowed: \n\010\r\014", "1") )
   CHECK_GOOD( "UC-06", DcmUnlimitedCharacters::checkStringValue(" ", "1") )
   CHECK_GOOD( "UC-07", DcmUnlimitedCharacters::checkStringValue("A\\B", "2") )
diff --git a/dcmdata/tests/tests.cc b/dcmdata/tests/tests.cc
index c8b40d4215..6949096348 100644
--- a/dcmdata/tests/tests.cc
+++ b/dcmdata/tests/tests.cc
@@ -25,6 +25,11 @@
 
 OFTEST_REGISTER(dcmdata_partialElementAccess);
 OFTEST_REGISTER(dcmdata_i2d_bmp);
+OFTEST_REGISTER(dcmdata_charString_derived_getVM);
+OFTEST_REGISTER(dcmdata_charString_getVM_multibyte);
+OFTEST_REGISTER(dcmdata_charString_getOFString);
+OFTEST_REGISTER(dcmdata_charString_getOFStringArray);
+OFTEST_REGISTER(dcmdata_charString_putOFStringAtPos);
 OFTEST_REGISTER(dcmdata_checkStringValue);
 OFTEST_REGISTER(dcmdata_determineVM);
 OFTEST_REGISTER(dcmdata_getValueFromString);