@@ -14309,126 +14309,155 @@ unicode_getnewargs(PyObject *v, PyObject *Py_UNUSED(ignored))
1430914309}
1431014310
1431114311/*
14312- Find the longest common leading whitespace among a list of lines.
14313- Whitespace-only lines are ignored.
14314- Returns the margin length (>= 0).
14312+ Search the bytes in [src, end), split into lines at '\n', for the longest
14313+ leading whitespace shared by every line; empty and whitespace-only lines are skipped.
14314+ It returns the length of that common leading whitespace (0 if there is none) and,
14315+ only if length > 0, sets *output* to the start of the first non-blank line, where it begins.
1431514316*/
1431614317static Py_ssize_t
14317- search_longest_common_leading_whitespace (PyObject * lines , Py_ssize_t nlines )
14318- {
14319- PyObject * smallest = NULL , * largest = NULL ;
14320- for (Py_ssize_t i = 0 ; i < nlines ; i ++ ) {
14321- PyObject * line = PyList_GET_ITEM (lines , i );
14322- Py_ssize_t linelen = PyUnicode_GET_LENGTH (line );
14323-
14324- if (linelen == 0 ) {
14325- continue ;
14326- }
14327-
14328- int kind = PyUnicode_KIND (line );
14329- void * data = PyUnicode_DATA (line );
14330- int all_ws = 1 ;
14331- for (Py_ssize_t j = 0 ; j < linelen ; j ++ ) {
14332- if (!Py_UNICODE_ISSPACE (PyUnicode_READ (kind , data , j ))) {
14333- all_ws = 0 ;
14334- break ;
14318+ search_longest_common_leading_whitespace (
14319+ const char * const src ,
14320+ const char * const end ,
14321+ const char * * output )
14322+ {
14323+ // [_start, _start + _len) is the common leading whitespace found so far;
14324+ // _start stays NULL until the first non-blank line has been seen
14325+ const char * _start = NULL ;
14326+ Py_ssize_t _len = 0 ;
14327+
14328+ for (const char * iter = src ; iter < end ; ++ iter ) { // ++iter steps over the '\n' that ended the previous line
14329+ const char * line_start = iter ;
14330+ const char * leading_whitespace_end = NULL ;
14331+
14332+ // scan to the end of the line, recording where its leading whitespace stops
14333+ while (iter < end && * iter != '\n' ) {
14334+ if (!leading_whitespace_end && !Py_ISSPACE (Py_CHARMASK (* iter ))) {
14335+ if (iter == line_start ) {
14336+ // this line starts with a non-whitespace byte, so no common indent exists: fast exit!
14337+ return 0 ;
14338+ }
14339+ leading_whitespace_end = iter ;
1433514340 }
14341+ ++ iter ;
1433614342 }
14337- if (all_ws ) {
14343+
14344+ // empty or whitespace-only line (no non-whitespace byte was found): skip it
14345+ if (!leading_whitespace_end ) {
1433814346 continue ;
1433914347 }
1434014348
14341- if (smallest == NULL || PyObject_RichCompareBool (line , smallest , Py_LT )) {
14342- smallest = line ;
14349+ if (!_start ) {
14350+ // first non-blank line: its whole leading whitespace is the initial candidate
14351+ _start = line_start ;
14352+ _len = leading_whitespace_end - line_start ;
14353+ assert (_len > 0 );
1434314354 }
14344- if (largest == NULL || PyObject_RichCompareBool (line , largest , Py_GT )) {
14345- largest = line ;
14346- }
14347- }
14355+ else {
14356+ /* Compare this line with the current longest leading whitespace.
1434814357
14349- if (smallest == NULL || largest == NULL ) {
14350- return 0 ;
14351- }
14358+ [line_start, leading_whitespace_end) is the leading
14359+ whitespace of this line,
1435214360
14353- Py_ssize_t margin = 0 ;
14354- Py_ssize_t minlen = Py_MIN (PyUnicode_GET_LENGTH (smallest ),
14355- PyUnicode_GET_LENGTH (largest ));
14356- int skind = PyUnicode_KIND (smallest );
14357- int lkind = PyUnicode_KIND (largest );
14358- const void * sdata = PyUnicode_DATA (smallest );
14359- const void * ldata = PyUnicode_DATA (largest );
14361+ [_start, _start + _len) is the common leading whitespace
14362+ found so far; it shrinks to the byte prefix the two share. */
14363+ Py_ssize_t new_len = 0 ;
14364+ const char * _iter = _start , * line_iter = line_start ;
1436014365
14361- while (margin < minlen ) {
14362- Py_UCS4 c1 = PyUnicode_READ (skind , sdata , margin );
14363- Py_UCS4 c2 = PyUnicode_READ (lkind , ldata , margin );
14364- if (c1 != c2 || !(c1 == ' ' || c1 == '\t' )) {
14365- break ;
14366+ while (_iter < _start + _len && line_iter < leading_whitespace_end
14367+ && * _iter == * line_iter )
14368+ {
14369+ ++ _iter ;
14370+ ++ line_iter ;
14371+ ++ new_len ;
14372+ }
14373+
14374+ _len = new_len ;
14375+ if (_len == 0 ) {
14376+ // nothing in common any more, fast exit!
14377+ return 0 ;
14378+ }
1436614379 }
14367- margin ++ ;
1436814380 }
1436914381
14370- return margin ;
14382+ assert (_len >= 0 );
14383+ if (_len > 0 ) {
14384+ * output = _start ;
14385+ }
14386+ return _len ;
1437114387}
1437214388
1437314389/* Dedent a string.
14374- Behaviour is expected to be an exact match of ` textwrap.dedent` .
14375- Return a new reference on success, NULL with exception set on error.
14390+ Behaviour is expected to be an exact match of textwrap.dedent (NOTE(review): whitespace is now tested per byte with Py_ISSPACE, while the removed code counted only ' ' and '\t' towards the margin -- confirm the match still holds).
14391+ Return a new reference on success (to the input itself when it is empty), NULL with an exception set on error.
1437614392 */
1437714393PyObject *
1437814394_PyUnicode_Dedent (PyObject * unicode )
1437914395{
14380- PyObject * sep = PyUnicode_FromString ("\n" );
14381- if (sep == NULL ) {
14396+ Py_ssize_t src_len = 0 ;
14397+ const char * src = PyUnicode_AsUTF8AndSize (unicode , & src_len );
14398+ if (!src ) {
1438214399 return NULL ;
1438314400 }
14384- PyObject * lines = PyUnicode_Split (unicode , sep , -1 );
14385- Py_DECREF (sep );
14386- if (lines == NULL ) {
14387- return NULL ;
14401+ assert (src_len >= 0 );
14402+ if (src_len == 0 ) {
14403+ return Py_NewRef (unicode ); // empty input: nothing to dedent
1438814404 }
14389- Py_ssize_t nlines = PyList_GET_SIZE (lines );
14390- Py_ssize_t margin = search_longest_common_leading_whitespace (lines , nlines );
1439114405
14392- PyUnicodeWriter * writer = PyUnicodeWriter_Create (0 );
14393- if (writer == NULL ) {
14394- Py_DECREF (lines );
14406+ const char * const end = src + src_len ;
14407+
14408+ // [whitespace_start, whitespace_start + whitespace_len)
14409+ // describes the leading whitespace common to all non-blank lines
14410+ const char * whitespace_start = NULL ;
14411+ const Py_ssize_t whitespace_len = search_longest_common_leading_whitespace (
14412+ src , end , & whitespace_start );
14413+
14414+ // build the dedented bytes in a scratch buffer; the loop below never writes more than src_len bytes
14415+ char * dest = PyMem_Malloc (src_len );
14416+ if (!dest ) {
14417+ PyErr_NoMemory ();
1439514418 return NULL ;
1439614419 }
14420+ char * dest_iter = dest ;
1439714421
14398- for (Py_ssize_t i = 0 ; i < nlines ; i ++ ) {
14399- PyObject * line = PyList_GET_ITEM ( lines , i ) ;
14400- Py_ssize_t linelen = PyUnicode_GET_LENGTH ( line ) ;
14422+ for (const char * iter = src ; iter < end ; ++ iter ) { // ++iter steps over the '\n' that ended the previous line
14423+ const char * line_start = iter ;
14424+ bool in_leading_space = true ;
1440114425
14402- int all_ws = 1 ;
14403- int kind = PyUnicode_KIND (line );
14404- void * data = PyUnicode_DATA (line );
14405- for (Py_ssize_t j = 0 ; j < linelen ; j ++ ) {
14406- if (!Py_UNICODE_ISSPACE (PyUnicode_READ (kind , data , j ))) {
14407- all_ws = 0 ;
14408- break ;
14426+ // find the end of the line and note whether it holds any non-whitespace byte
14427+ while (iter < end && * iter != '\n' ) {
14428+ if (in_leading_space && !Py_ISSPACE (Py_CHARMASK (* iter ))) {
14429+ in_leading_space = false;
1440914430 }
14431+ ++ iter ;
1441014432 }
1441114433
14412- if (!all_ws ) {
14413- Py_ssize_t start = Py_MIN (margin , linelen );
14414- if (PyUnicodeWriter_WriteSubstring (writer , line , start , linelen ) < 0 ) {
14415- PyUnicodeWriter_Discard (writer );
14416- Py_DECREF (lines );
14417- return NULL ;
14434+ // invariant: *iter == '\n' or iter == end
14435+ const bool append_newline = iter < end ;
14436+
14437+ // empty or whitespace-only line: drop its content, write '\n' if it had one, and continue
14438+ if (in_leading_space ) {
14439+ if (append_newline ) {
14440+ * dest_iter ++ = '\n' ;
1441814441 }
14442+ continue ;
1441914443 }
1442014444
14421- if (i < nlines - 1 ) {
14422- if (PyUnicodeWriter_WriteChar (writer , '\n' ) < 0 ) {
14423- PyUnicodeWriter_Discard (writer );
14424- Py_DECREF (lines );
14425- return NULL ;
14426- }
14445+ /* copy [line_start + whitespace_len, iter) to buffer, then
14446+ conditionally append '\n' */
14447+ const Py_ssize_t new_line_len = iter - line_start - whitespace_len ;
14448+ assert (new_line_len >= 0 );
14449+ memcpy (dest_iter , line_start + whitespace_len , new_line_len );
14450+
14451+ dest_iter += new_line_len ;
14452+
14453+ if (append_newline ) {
14454+ * dest_iter ++ = '\n' ;
1442714455 }
1442814456 }
1442914457
14430- Py_DECREF (lines );
14431- return PyUnicodeWriter_Finish (writer );
14458+ PyObject * res = PyUnicode_FromStringAndSize (dest , dest_iter - dest );
14459+ PyMem_Free (dest ); // the scratch buffer is released whether or not res is NULL
14460+ return res ;
1443214461}
1443314462
1443414463static PyMethodDef unicode_methods [] = {
0 commit comments