Skip to content

Commit

Permalink
Revert to pepebecker's algorithm, adjust it to work without pinyin-utils and make pinyin list wrapping optional; Add new tests
Browse files Browse the repository at this point in the history
  • Loading branch information
T-vK authored and pepebecker committed Nov 1, 2018
1 parent f1f7bbd commit 7223315
Show file tree
Hide file tree
Showing 2 changed files with 43 additions and 31 deletions.
59 changes: 30 additions & 29 deletions index.js
Expand Up @@ -6,37 +6,38 @@ const allWords = Object.values(wordsData).reduce((o, i) => o.concat(i), [])

// Reduce a pinyin syllable to its bare ASCII form so it can be looked up in
// the word list: strip tone diacritics (via NFD decomposition), drop tone
// digits 1-5 that follow a letter, and lowercase the result.
const normalizePinyin = (pinyin) => {
  const decomposed = pinyin.normalize('NFD')
  const withoutMarks = decomposed.replace(/[\u0300-\u036f]/g, '')
  const withoutToneDigits = withoutMarks.replace(/(\w)[1-5]/g, '$1')
  return withoutToneDigits.toLowerCase()
}

// NOTE(review): this is the DELETED (pre-commit) implementation exactly as the
// diff renders it; its closing lines are interleaved further down the page, so
// the fragment below is incomplete as shown — do not copy it as-is.
// It walks the text right-to-left, growing `currentString` until substring
// matching against `allWords` suggests a word boundary.
const split = (text, everything=false, list=false) => {
const words = []
let previousString = ''
let currentString = ''
for (const [i,char] of text.split('').reverse().entries()) {
currentString = char + currentString
const previousStringNormalized = normalizePinyin(previousString)
const currentStringNormalized = normalizePinyin(currentString)
// NOTE(review): iteration is over the reversed text, so i === 0 is the LAST
// character of the original string and i+1 === text.length the first.
const isLastIteration = i+1===text.length
const isFirstAndNotLastIteration = text.length>1 && i===0
const currentContainsOnlySpecialChars = /^[^\w]+$/.test(currentStringNormalized)
const previousContainsOnlySpecialChars = /^[^\w]+$/.test(previousStringNormalized)
const currentIsValidWord = allWords.includes(currentStringNormalized)
const previousIsValidWord = allWords.includes(previousStringNormalized)
// Count of dictionary words containing the candidate as a substring; dropping
// from >=1 to 0 signals that the candidate has grown past any possible word.
const currentMatchCount = allWords.filter(word=>word.includes(currentStringNormalized)).length
const previousMatchCount = allWords.filter(word=>word.includes(previousStringNormalized)).length
if (!isFirstAndNotLastIteration && ((previousMatchCount >= 1 && currentMatchCount === 0) || isLastIteration || (previousContainsOnlySpecialChars && !currentContainsOnlySpecialChars))) {
// `everything` keeps non-word runs in the output; otherwise only valid words survive.
if (everything || currentIsValidWord || previousIsValidWord) {
if (currentIsValidWord || currentContainsOnlySpecialChars) {
// `list` mode wraps valid words in a one-element array.
words.push(list && currentIsValidWord ? [currentString] : currentString)
} else {
words.push(list && previousIsValidWord ? [previousString] : previousString)
if (isFirstAndNotLastIteration)
words.push(char)
}
}
// Restart accumulation from the character that ended the previous word.
currentString = char
// Split `text` into pinyin words by scanning from the end of the string and
// greedily matching the longest dictionary word that ends at the cursor.
//
// @param {string} text - input that may mix pinyin with other characters
// @param {boolean} everything - when true, non-pinyin characters are kept in
//   the output; when false they are silently dropped
// @param {boolean} returnAsList - when true, each pinyin word is wrapped in a
//   one-element array so callers can tell words apart from passthrough runs,
//   and consecutive non-pinyin characters are merged into a single string
// @returns {Array<string|string[]>} tokens in original text order
const split = (text, everything = false, returnAsList = false) => {
  const list = []
  let wordEnd = text.length
  while (wordEnd > 0) {
    // Try the longest candidate ending at `wordEnd` first, shrinking on miss.
    let count = wordEnd
    let wordFound = false
    while (count > 0) {
      const word = text.substring(wordEnd - count, wordEnd)
      if (allWords.includes(normalizePinyin(word))) {
        wordFound = true
        list.push(returnAsList ? [word] : word)
        // Consume the matched word; the trailing `wordEnd--` below takes the
        // remaining character, for a net move of `count`.
        wordEnd -= (count - 1)
        break
      }
      count--
    }
    if (!wordFound && everything) {
      const last = list[list.length - 1]
      if (wordEnd === text.length || typeof last === 'object' || !returnAsList) {
        // Start a new passthrough token. Outside list mode every character
        // stays its own token, matching the non-wrapped output shape.
        list.push(text[wordEnd - 1])
      } else {
        // List mode with a string run in progress: grow it. We scan
        // right-to-left, so prepend to keep characters in original order.
        // (The original commit had an unreachable `else` branch here — it
        // required `!returnAsList`, which the first condition already takes.)
        list[list.length - 1] = text[wordEnd - 1] + last
      }
    }
    wordEnd--
  }
  // Tokens were collected back-to-front; restore original order.
  return list.reverse()
}

module.exports = split
15 changes: 13 additions & 2 deletions test/index.js
Expand Up @@ -73,8 +73,8 @@ describe('Split spaced text and keep spaces', () => {

describe('Split spaced and punctuated text and keep spaces as well as punctuation', () => {
it('should split the text into the correct words and punctuation', done => {
const list = ['',' ','bú',' ','huì',' ','shuō',' ','Yīng','wén','.']
split(' bú huì shuō Yīngwén.', true).should.deepEqual(list)
const list = ['',' ','bú',' ','huì',' ','shuō',' ','Yīng','wén','.']
split(' bú huì shuō Yīngwén.', true).should.deepEqual(list)
done()
})
it('should split the text into the correct words and punctuation', done => {
Expand All @@ -83,3 +83,14 @@ describe('Split spaced and punctuated text and keep spaces as well as punctuatio
done()
})
})

// Regression test for the new optional list-wrapping mode (third `split`
// argument): pinyin words come back wrapped in one-element arrays while
// runs of other characters are merged into plain strings.
describe('Split text containing English words', () => {
it('should split text containing English and Pinyin correctly', done => {
const list = [
'This is ', [ 'ran' ], 'd', [ 'o' ], 'm ', [ 'te' ], 'xt: "',
[ 'wo' ], [ 'de' ], [ 'mao' ], [ 'xi' ], [ 'huan' ], [ 'he' ], [ 'niu' ], [ 'nai' ], '".'
]
split('This is random text: "wodemaoxihuanheniunai".', true, true).should.deepEqual(list)
done()
})
})

0 comments on commit 7223315

Please sign in to comment.