Index: source/i18n/regexcmp.cpp =================================================================== --- source/i18n/regexcmp.cpp (revision 23251) +++ source/i18n/regexcmp.cpp (revision 23418) @@ -2,7 +2,7 @@ // // file: regexcmp.cpp // -// Copyright (C) 2002-2007 International Business Machines Corporation and others. +// Copyright (C) 2002-2008 International Business Machines Corporation and others. // All Rights Reserved. // // This file contains the ICU regular expression compiler, which is responsible @@ -1186,14 +1186,17 @@ // Because capture groups can be forward-referenced by back-references, // we fill the operand with the capture group number. At the end // of compilation, it will be changed to the variable's location. - U_ASSERT(groupNum > 0); - int32_t op; - if (fModeFlags & UREGEX_CASE_INSENSITIVE) { - op = URX_BUILD(URX_BACKREF_I, groupNum); + if (groupNum < 1) { + error(U_REGEX_INVALID_BACK_REF); } else { - op = URX_BUILD(URX_BACKREF, groupNum); + int32_t op; + if (fModeFlags & UREGEX_CASE_INSENSITIVE) { + op = URX_BUILD(URX_BACKREF_I, groupNum); + } else { + op = URX_BUILD(URX_BACKREF, groupNum); + } + fRXPat->fCompiledPat->addElement(op, *fStatus); } - fRXPat->fCompiledPat->addElement(op, *fStatus); } break; Index: source/i18n/rematch.cpp =================================================================== --- source/i18n/rematch.cpp (revision 23251) +++ source/i18n/rematch.cpp (revision 23418) @@ -1,6 +1,6 @@ /* ************************************************************************** -* Copyright (C) 2002-2007 International Business Machines Corporation * +* Copyright (C) 2002-2008 International Business Machines Corporation * * and others. All rights reserved. * ************************************************************************** */ @@ -30,6 +30,15 @@ U_NAMESPACE_BEGIN +// Limit the size of the back track stack, to avoid system failures caused +// by heap exhaustion. Units are in 32 bit words, not bytes. 
+// This value puts ICU's limits higher than most other regexp implementations, +// which use recursion rather than the heap, and take more storage per +// backtrack point. +// This constant is _temporary_. Proper API to control the value will be added. +// +static const int32_t BACKTRACK_STACK_CAPACITY = 8000000; + //----------------------------------------------------------------------------- // // Constructor and Destructor @@ -53,8 +62,9 @@ } if (fStack == NULL || fData == NULL) { fDeferredStatus = U_MEMORY_ALLOCATION_ERROR; + } else { + fStack->setMaxCapacity(BACKTRACK_STACK_CAPACITY); } - reset(RegexStaticSets::gStaticSets->fEmptyString); } @@ -78,6 +88,8 @@ } if (fStack == NULL || fData == NULL) { status = U_MEMORY_ALLOCATION_ERROR; + } else { + fStack->setMaxCapacity(BACKTRACK_STACK_CAPACITY); } reset(input); } @@ -102,6 +114,8 @@ } if (fStack == NULL || fData == NULL) { status = U_MEMORY_ALLOCATION_ERROR; + } else { + fStack->setMaxCapacity(BACKTRACK_STACK_CAPACITY); } reset(RegexStaticSets::gStaticSets->fEmptyString); } @@ -1014,6 +1028,14 @@ inline REStackFrame *RegexMatcher::StateSave(REStackFrame *fp, int32_t savePatIdx, int32_t frameSize, UErrorCode &status) { // push storage for a new frame. int32_t *newFP = fStack->reserveBlock(frameSize, status); + if (newFP == NULL) { + // Heap allocation error on attempted stack expansion. + // We need to return a writable stack frame, so just return the + // previous frame. The match operation will stop quickly + // because of the error status, after which the frame will never + // be looked at again. + return fp; + } fp = (REStackFrame *)(newFP - frameSize); // in case of realloc of stack. // New stack frame = copy of old top frame. @@ -1029,8 +1051,8 @@ fp->fPatIdx = savePatIdx; return (REStackFrame *)newFP; } - - + + //-------------------------------------------------------------------------------- // // MatchAt This is the actual matching engine. 
@@ -2261,6 +2283,7 @@ } if (U_FAILURE(status)) { + isMatch = FALSE; break; } } Index: source/test/intltest/regextst.h =================================================================== --- source/test/intltest/regextst.h (revision 23251) +++ source/test/intltest/regextst.h (revision 23418) @@ -1,6 +1,6 @@ /******************************************************************** * COPYRIGHT: - * Copyright (c) 2002-2007, International Business Machines Corporation and + * Copyright (c) 2002-2008, International Business Machines Corporation and * others. All Rights Reserved. ********************************************************************/ @@ -30,6 +30,7 @@ virtual void Extended(); virtual void Errors(); virtual void PerlTests(); + virtual void Bug6149(); // The following functions are internal to the regexp tests. virtual UBool doRegexLMTest(const char *pat, const char *text, UBool looking, UBool match, int32_t line); Index: source/test/intltest/regextst.cpp =================================================================== --- source/test/intltest/regextst.cpp (revision 23251) +++ source/test/intltest/regextst.cpp (revision 23418) @@ -1,6 +1,6 @@ /******************************************************************** * COPYRIGHT: - * Copyright (c) 2002-2007, International Business Machines Corporation and + * Copyright (c) 2002-2008, International Business Machines Corporation and * others. All Rights Reserved. ********************************************************************/ @@ -66,6 +66,10 @@ case 6: name = "PerlTests"; if (exec) PerlTests(); break; + case 7: name = "Bug 6149"; + if (exec) Bug6149(); + break; + default: name = ""; @@ -1639,6 +1643,12 @@ // Ticket 5389 REGEX_ERR("*c", 1, 1, U_REGEX_RULE_SYNTAX); + + // Invalid Back Reference \0 + // For ICU 3.8 and earlier + // For ICU versions newer than 3.8, \0 introduces an octal escape. 
+ // + REGEX_ERR("(ab)\\0", 1, 6, U_REGEX_INVALID_BACK_REF); } @@ -2122,6 +2132,26 @@ } +//-------------------------------------------------------------- +// +// Bug6149 Verify limits to heap expansion for backtrack stack. +// Use this pattern, +// "(a?){1,}" +// The zero-length match will repeat forever. +// (That this goes into a loop is another bug) +// +//--------------------------------------------------------------- +void RegexTest::Bug6149() { + UnicodeString pattern("(a?){1,}"); + UnicodeString s("xyz"); + uint32_t flags = 0; + UErrorCode status = U_ZERO_ERROR; + + RegexMatcher matcher(pattern, s, flags, status); + UBool result = false; + REGEX_ASSERT_FAIL(result=matcher.matches(status), U_BUFFER_OVERFLOW_ERROR); + REGEX_ASSERT(result == FALSE); + } #endif /* !UCONFIG_NO_REGULAR_EXPRESSIONS */ Index: source/common/uvectr32.cpp =================================================================== --- source/common/uvectr32.cpp (revision 23251) +++ source/common/uvectr32.cpp (revision 23418) @@ -1,6 +1,6 @@ /* ****************************************************************************** -* Copyright (C) 1999-2003, International Business Machines Corporation and * +* Copyright (C) 1999-2008, International Business Machines Corporation and * * others. All Rights Reserved. 
* ****************************************************************************** * Date Name Description @@ -26,6 +26,7 @@ UVector32::UVector32(UErrorCode &status) : count(0), capacity(0), + maxCapacity(0), elements(NULL) { _init(DEFUALT_CAPACITY, status); @@ -34,6 +35,7 @@ UVector32::UVector32(int32_t initialCapacity, UErrorCode &status) : count(0), capacity(0), + maxCapacity(0), elements(0) { _init(initialCapacity, status); @@ -46,6 +48,9 @@ if (initialCapacity < 1) { initialCapacity = DEFUALT_CAPACITY; } + if (maxCapacity>0 && maxCapacity= minimumCapacity) { return TRUE; - } else { - int32_t newCap = capacity * 2; - if (newCap < minimumCapacity) { - newCap = minimumCapacity; - } - int32_t* newElems = (int32_t *)uprv_malloc(sizeof(int32_t)*newCap); - if (newElems == 0) { - status = U_MEMORY_ALLOCATION_ERROR; - return FALSE; - } - uprv_memcpy(newElems, elements, sizeof(elements[0]) * count); - uprv_free(elements); - elements = newElems; - capacity = newCap; - return TRUE; } + if (maxCapacity>0 && minimumCapacity>maxCapacity) { + status = U_BUFFER_OVERFLOW_ERROR; + return FALSE; + } + int32_t newCap = capacity * 2; + if (newCap < minimumCapacity) { + newCap = minimumCapacity; + } + if (maxCapacity > 0 && newCap > maxCapacity) { + newCap = maxCapacity; + } + int32_t* newElems = (int32_t *)uprv_malloc(sizeof(int32_t)*newCap); + if (newElems == 0) { + status = U_MEMORY_ALLOCATION_ERROR; + return FALSE; + } + uprv_memcpy(newElems, elements, sizeof(elements[0]) * count); + uprv_free(elements); + elements = newElems; + capacity = newCap; + return TRUE; } +void UVector32::setMaxCapacity(int32_t limit) { + U_ASSERT(limit >= 0); + maxCapacity = limit; + if (maxCapacity < 0) { + maxCapacity = 0; + } +} + /** * Change the size of this vector as follows: If newSize is smaller, * then truncate the array, possibly deleting held elements for i >= Index: source/common/uvectr32.h =================================================================== --- source/common/uvectr32.h 
(revision 23251) +++ source/common/uvectr32.h (revision 23418) @@ -1,6 +1,6 @@ /* ********************************************************************** -* Copyright (C) 1999-2006, International Business Machines +* Copyright (C) 1999-2008, International Business Machines * Corporation and others. All Rights Reserved. ********************************************************************** */ @@ -61,6 +61,8 @@ int32_t count; int32_t capacity; + + int32_t maxCapacity; // Limit beyond which capacity is not permitted to grow. int32_t* elements; @@ -162,6 +164,14 @@ int32_t *getBuffer() const; /** + * Set the maximum allowed buffer capacity for this vector/stack. + * Default with no limit set is unlimited, go until malloc() fails. + * A Limit of zero means unlimited capacity. + * Units are vector elements (32 bits each), not bytes. + */ + void setMaxCapacity(int32_t limit); + + /** * ICU "poor man's RTTI", returns a UClassID for this class. */ static UClassID U_EXPORT2 getStaticClassID(); @@ -221,7 +231,9 @@ } inline int32_t *UVector32::reserveBlock(int32_t size, UErrorCode &status) { - ensureCapacity(count+size, status); + if (ensureCapacity(count+size, status) == FALSE) { + return NULL; + } int32_t *rp = elements+count; count += size; return rp;