Combine non-pinyin chars and update tests

pepebecker · Nov 1, 2018 · c191891 · c191891
1 parent 7223315
commit c191891
Show file tree

Hide file tree

Showing 2 changed files with 121 additions and 50 deletions.
diff --git a/index.js b/index.js
@@ -6,8 +6,9 @@ const allWords = Object.values(wordsData).reduce((o, i) => o.concat(i), [])
 
 const normalizePinyin = (pinyin) => pinyin.normalize('NFD').replace(/[\u0300-\u036f]/g, '').replace(/(\w)[1-5]/g, '$1').toLowerCase()
 
-const split = (text, everything=false, returnAsList=false) => {
+const split = (text, everything=false, wrapInList=false) => {
   const list = []
+  let prevWordFound = false
   let wordEnd = text.length
   while (wordEnd > 0) {
     let count = wordEnd
@@ -16,26 +17,24 @@ const split = (text, everything=false, returnAsList=false) => {
       const word = text.substring(wordEnd - count, wordEnd)
       if (allWords.includes(normalizePinyin(word))) {
         wordFound = true
-        list.push(returnAsList ? [word] : word)
+        list.push(wrapInList ? [word] : word)
         wordEnd -= (count - 1)
         break
       }
       count--
     }
     if (!wordFound && everything) {
-      if (wordEnd === text.length || typeof list[list.length - 1] === 'object' || !returnAsList) {
+      const prevIndex = list.length - 1
+      const prevEntry = list[prevIndex]
+      if (wordEnd === text.length || typeof prevEntry === 'object' || prevWordFound) {
         list.push(text[wordEnd - 1])
       }
-      else if (typeof list[list.length - 1] === 'string') {
-        if (returnAsList) {
-          list[list.length - 1] = text[wordEnd - 1] + list[list.length - 1]
-        } else {
-          list[list.length - 1] = list[list.length - 1]
-          list.splice(list.length - 1, 0, text[wordEnd - 1])
-        }
+      else if (typeof prevEntry === 'string') {
+        list[prevIndex] = text[wordEnd - 1] + prevEntry
       }
     }
     wordEnd --
+    prevWordFound = wordFound
   }
   return list.reverse()
 }

diff --git a/test/index.js b/test/index.js
@@ -2,95 +2,167 @@
 
 const split = require('../index')
 
-describe('Split text and keep non-pinyin text', () => {
+// Test cases for split(text, everything = false, wrapInList = false): only the Pinyin words are returned
+
+describe('Split text with non-spaced Pinyin and return Pinyin only', () => {
 	it('should split the text into the correct words', done => {
-		const list = ['wo', 'de', 'mao', 'xi', 'huan', 'he', 'niu', 'nai']
-		split('wodemaoxihuanheniunai').should.deepEqual(list)
+		const list = ['a', 'an', 'Pin', 'yin', 'wo', 'de', 'mao', 'xi', 'huan', 'he', 'niu', 'nai']
+		split('I am 本 and this is Pinyin: "wodemaoxihuanheniunai".').should.deepEqual(list)
 		done()
 	})
 	it('should split the text into the correct words', done => {
-		const list = ['wo3', 'de', 'mao1', 'xi3', 'huan1', 'he2', 'niu3', 'nai3']
-		split('wo3demao1xi3huan1he2niu3nai3').should.deepEqual(list)
+		const list = ['a', 'an', 'Pin', 'yin', 'wo3', 'de', 'mao1', 'xi3', 'huan', 'he1', 'niu2', 'nai3']
+		split('I am 本 and this is Pinyin: "wo3demao1xi3huanhe1niu2nai3".').should.deepEqual(list)
 		done()
 	})
 	it('should split the text into the correct words', done => {
-		const list = ['wǒ', 'de', 'māo', 'xǐ', 'huān', 'hé', 'niǔ', 'nǎi']
-		split('wǒdemāoxǐhuānhéniǔnǎi').should.deepEqual(list)
+		const list = ['a', 'an', 'Pin', 'yin', 'wǒ', 'de', 'māo', 'xǐ', 'huan', 'hē', 'niú', 'nǎi']
+		split('I am 本 and this is Pinyin: "wǒdemāoxǐhuanhēniúnǎi".').should.deepEqual(list)
 		done()
 	})
+})
+
+describe('Split text with spaced Pinyin and return Pinyin only', () => {
 	it('should split the text into the correct words', done => {
-		const list = ['de', 're', 'wo', 'fen', 'dou', 'dou', 'wo', 'ren', 'wei']
-		split('derewofendoudouworenwei').should.deepEqual(list)
+		const list = ['a', 'an', 'Pin', 'yin', 'wo', 'de', 'mao', 'xi', 'huan', 'he', 'niu', 'nai']
+		split('I am 本 and this is Pinyin: "wo de mao xihuan he niunai".').should.deepEqual(list)
 		done()
 	})
 	it('should split the text into the correct words', done => {
-		const list = ['de2', 're3', 'wo3', 'fen4', 'dou4', 'dou3', 'wo3', 'ren4', 'wei4']
-		split('de2re3wo3fen4dou4dou3wo3ren4wei4').should.deepEqual(list)
+		const list = ['a', 'an', 'Pin', 'yin', 'wo3', 'de', 'mao1', 'xi3', 'huan', 'he1', 'niu2', 'nai3']
+		split('I am 本 and this is Pinyin: "wo3 de mao1 xi3huan he1 niu2nai3".').should.deepEqual(list)
 		done()
 	})
 	it('should split the text into the correct words', done => {
-		const list = ['dé', 'rě', 'wǒ', 'fèn', 'dòu', 'dǒu', 'wǒ', 'rèn', 'wèi']
-		split('dérěwǒfèndòudǒuwǒrènwèi').should.deepEqual(list)
+		const list = ['a', 'an', 'Pin', 'yin', 'wǒ', 'de', 'māo', 'xǐ', 'huan', 'hē', 'niú', 'nǎi']
+		split('I am 本 and this is Pinyin: "wǒ de māo xǐhuan hē niúnǎi".').should.deepEqual(list)
 		done()
 	})
 })
 
-describe('Split text and return Pinyin only', () => {
+// Test cases for split(text, everything = true, wrapInList = false): Pinyin and non-Pinyin text are both returned as plain strings
+
+describe('Split text with non-spaced Pinyin and return everything', () => {
 	it('should split the text into the correct words', done => {
-		const list = ['wo', 'de', 'mao', 'xi', 'huan', 'he', 'niu', 'nai']
-		split('wo de mao xihuan he niunai').should.deepEqual(list)
+		const list = [
+			'I ', 'a', 'm 本 ', 'an', 'd this is ', 'Pin', 'yin', ': "',
+			'wo', 'de', 'mao', 'xi', 'huan', 'he', 'niu', 'nai', '".'
+		]
+		split('I am 本 and this is Pinyin: "wodemaoxihuanheniunai".', true).should.deepEqual(list)
 		done()
 	})
 	it('should split the text into the correct words', done => {
-		const list = ['wo3', 'de', 'mao1', 'xi3', 'huan1', 'he2', 'niu3', 'nai3']
-		split('wo3 de mao1 xi3huan1 he2 niu3nai3').should.deepEqual(list)
+		const list = [
+			'I ', 'a', 'm 本 ', 'an', 'd this is ', 'Pin', 'yin', ': "',
+			'wo3', 'de', 'mao1', 'xi3', 'huan', 'he1', 'niu2', 'nai3', '".'
+		]
+		split('I am 本 and this is Pinyin: "wo3demao1xi3huanhe1niu2nai3".', true).should.deepEqual(list)
 		done()
 	})
 	it('should split the text into the correct words', done => {
-		const list = ['wǒ', 'de', 'māo', 'xǐ', 'huān', 'hé', 'niǔ', 'nǎi']
-		split('wǒ de māo xǐhuān hé niǔnǎi').should.deepEqual(list)
+		const list = [
+			'I ', 'a', 'm 本 ', 'an', 'd this is ', 'Pin', 'yin', ': "',
+			'wǒ', 'de', 'māo', 'xǐ', 'huan', 'hē', 'niú', 'nǎi', '".'
+		]
+		split('I am 本 and this is Pinyin: "wǒdemāoxǐhuanhēniúnǎi".', true).should.deepEqual(list)
 		done()
 	})
 })
 
-describe('Split spaced text and keep spaces', () => {
+describe('Split text with spaced Pinyin and return everything', () => {
 	it('should split the text into the correct words', done => {
-		const list = [['wo'], ' ', ['de'], ' ', ['mao'], ' ', ['xi'], ['huan'], ' ', ['he'], ' ', ['niu'], ['nai']]
-		split('wo de mao xihuan he niunai', true, true).should.deepEqual(list)
+		const list = [
+			'I ', 'a', 'm 本 ', 'an', 'd this is ', 'Pin', 'yin', ': "',
+			'wo', ' ', 'de', ' ', 'mao', ' ', 'xi', 'huan', ' ', 'he', ' ', 'niu', 'nai', '".'
+		]
+		split('I am 本 and this is Pinyin: "wo de mao xihuan he niunai".', true).should.deepEqual(list)
 		done()
 	})
 	it('should split the text into the correct words', done => {
-		const list = [['wo3'], ' ', ['de'], ' ', ['mao1'], ' ', ['xi3'], ['huan'], ' ', ['he2'], ' ', ['niu3'], ['nai3']]
-		split('wo3 de mao1 xi3huan he2 niu3nai3', true, true).should.deepEqual(list)
+		const list = [
+			'I ', 'a', 'm 本 ', 'an', 'd this is ', 'Pin', 'yin', ': "',
+			'wo3', ' ', 'de', ' ', 'mao1', ' ', 'xi3', 'huan', ' ', 'he1', ' ', 'niu2', 'nai3', '".'
+		]
+		split('I am 本 and this is Pinyin: "wo3 de mao1 xi3huan he1 niu2nai3".', true).should.deepEqual(list)
 		done()
 	})
 	it('should split the text into the correct words', done => {
-		const list = [['wǒ'], ' ', ['de'], ' ', ['māo'], ' ', ['xǐ'], ['huan'], ' ', ['hé'], ' ', ['niǔ'], ['nǎi']]
-		split('wǒ de māo xǐhuan hé niǔnǎi', true, true).should.deepEqual(list)
+		const list = [
+			'I ', 'a', 'm 本 ', 'an', 'd this is ', 'Pin', 'yin', ': "',
+			'wǒ', ' ', 'de', ' ', 'māo', ' ', 'xǐ', 'huan', ' ', 'hē', ' ', 'niú', 'nǎi', '".'
+		]
+		split('I am 本 and this is Pinyin: "wǒ de māo xǐhuan hē niúnǎi".', true).should.deepEqual(list)
 		done()
 	})
 })
 
-describe('Split spaced and punctuated text and keep spaces as well as punctuation', () => {
-	it('should split the text into the correct words and punctuation', done => {
-		const list = ['Wǒ',' ','bú',' ','huì',' ','shuō',' ','Yīng','wén','.']
-		split('Wǒ bú huì shuō Yīngwén.', true).should.deepEqual(list)
+// Test cases for split(text, everything = true, wrapInList = true): each Pinyin word is wrapped in its own array
+
+describe('Split text with non-spaced Pinyin and return everything with Pinyin wrapped in lists', () => {
+	it('should split the text into the correct words', done => {
+		const list = [
+			'I ', ['a'], 'm 本 ', ['an'], 'd this is ', ['Pin'], ['yin'], ': "',
+			['wo'], ['de'], ['mao'], ['xi'], ['huan'], ['he'], ['niu'], ['nai'], '".'
+		]
+		split('I am 本 and this is Pinyin: "wodemaoxihuanheniunai".', true, true).should.deepEqual(list)
 		done()
 	})
-	it('should split the text into the correct words and punctuation', done => {
-		const list = ['Wo3',' ','bu2',' ','hui4',' ','shuo1',' ','Ying1','wen2','.']
-		split('Wo3 bu2 hui4 shuo1 Ying1wen2.', true).should.deepEqual(list)
+	it('should split the text into the correct words', done => {
+		const list = [
+			'I ', ['a'], 'm 本 ', ['an'], 'd this is ', ['Pin'], ['yin'], ': "',
+			['wo3'], ['de'], ['mao1'], ['xi3'], ['huan'], ['he1'], ['niu2'], ['nai3'], '".'
+		]
+		split('I am 本 and this is Pinyin: "wo3demao1xi3huanhe1niu2nai3".', true, true).should.deepEqual(list)
+		done()
+	})
+	it('should split the text into the correct words', done => {
+		const list = [
+			'I ', ['a'], 'm 本 ', ['an'], 'd this is ', ['Pin'], ['yin'], ': "',
+			['wǒ'], ['de'], ['māo'], ['xǐ'], ['huan'], ['hē'], ['niú'], ['nǎi'], '".'
+		]
+		split('I am 本 and this is Pinyin: "wǒdemāoxǐhuanhēniúnǎi".', true, true).should.deepEqual(list)
 		done()
 	})
 })
 
-describe('Split text containing English words', () => {
-	it('should split text containing English and Pinyin correctly', done => {
+describe('Split text with spaced Pinyin and return everything with Pinyin wrapped in lists', () => {
+	it('should split the text into the correct words', done => {
+		const list = [
+			'I ', ['a'], 'm 本 ', ['an'], 'd this is ', ['Pin'], ['yin'], ': "',
+			['wo'], ' ', ['de'], ' ', ['mao'], ' ', ['xi'], ['huan'], ' ', ['he'], ' ', ['niu'], ['nai'], '".'
+		]
+		split('I am 本 and this is Pinyin: "wo de mao xihuan he niunai".', true, true).should.deepEqual(list)
+		done()
+	})
+	it('should split the text into the correct words', done => {
 		const list = [
-            'This is ', [ 'ran' ], 'd', [ 'o' ], 'm ', [ 'te' ], 'xt: "',
-            [ 'wo' ], [ 'de' ], [ 'mao' ], [ 'xi' ], [ 'huan' ], [ 'he' ], [ 'niu' ], [ 'nai' ], '".'
-        ]
-		split('This is random text: "wodemaoxihuanheniunai".', true, true).should.deepEqual(list)
+			'I ', ['a'], 'm 本 ', ['an'], 'd this is ', ['Pin'], ['yin'], ': "',
+			['wo3'], ' ', ['de'], ' ', ['mao1'], ' ', ['xi3'], ['huan'], ' ', ['he1'], ' ', ['niu2'], ['nai3'], '".'
+		]
+		split('I am 本 and this is Pinyin: "wo3 de mao1 xi3huan he1 niu2nai3".', true, true).should.deepEqual(list)
+		done()
+	})
+	it('should split the text into the correct words', done => {
+		const list = [
+			'I ', ['a'], 'm 本 ', ['an'], 'd this is ', ['Pin'], ['yin'], ': "',
+			['wǒ'], ' ', ['de'], ' ', ['māo'], ' ', ['xǐ'], ['huan'], ' ', ['hē'], ' ', ['niú'], ['nǎi'], '".'
+		]
+		split('I am 本 and this is Pinyin: "wǒ de māo xǐhuan hē niúnǎi".', true, true).should.deepEqual(list)
+		done()
+	})
+})
+
+// Extra test cases: capitalized Pinyin with spaces and trailing punctuation
+
+describe('Split spaced and punctuated text and keep spaces as well as punctuation', () => {
+	it('should split the text into the correct words and punctuation', done => {
+		const list = ['Wǒ', ' ', 'bú', ' ', 'huì', ' ', 'shuō', ' ', 'Yīng', 'wén', '.']
+		split('Wǒ bú huì shuō Yīngwén.', true).should.deepEqual(list)
+		done()
+	})
+	it('should split the text into the correct words and punctuation', done => {
+		const list = ['Wo3', ' ', 'bu2', ' ', 'hui4', ' ', 'shuo1', ' ', 'Ying1', 'wen2', '.']
+		split('Wo3 bu2 hui4 shuo1 Ying1wen2.', true).should.deepEqual(list)
 		done()
 	})
 })