Skip to content

Commit

Permalink
Revert to pepebecker's algorithm, adjust it to work without pinyin-utils and make pinyin list wrapping optional; Add new tests
Browse files Browse the repository at this point in the history
  • Loading branch information
T-vK authored and pepebecker committed Nov 1, 2018
1 parent f1f7bbd commit 7223315
Show file tree
Hide file tree
Showing 2 changed files with 43 additions and 31 deletions.
59 changes: 30 additions & 29 deletions index.js
Expand Up @@ -6,37 +6,38 @@ const allWords = Object.values(wordsData).reduce((o, i) => o.concat(i), [])

// Reduce a pinyin syllable to its bare ASCII form so it can be looked up in
// the word list: strip tone diacritics (via NFD decomposition), drop tone
// digits 1-5 that follow a letter, and lowercase the result.
const normalizePinyin = (pinyin) => {
  const decomposed = pinyin.normalize('NFD')
  const withoutMarks = decomposed.replace(/[\u0300-\u036f]/g, '')
  const withoutToneDigits = withoutMarks.replace(/(\w)[1-5]/g, '$1')
  return withoutToneDigits.toLowerCase()
}

// NOTE(review): this is the DELETED (pre-commit) implementation exactly as the
// diff renders it; its closing lines are interleaved further down the page, so
// the fragment below is incomplete as shown — do not copy it as-is.
// It walks the text right-to-left, growing `currentString` until substring
// matching against `allWords` suggests a word boundary.
const split = (text, everything=false, list=false) => {
const words = []
let previousString = ''
let currentString = ''
for (const [i,char] of text.split('').reverse().entries()) {
currentString = char + currentString
const previousStringNormalized = normalizePinyin(previousString)
const currentStringNormalized = normalizePinyin(currentString)
// NOTE(review): iteration is over the reversed text, so i === 0 is the LAST
// character of the original string and i+1 === text.length the first.
const isLastIteration = i+1===text.length
const isFirstAndNotLastIteration = text.length>1 && i===0
const currentContainsOnlySpecialChars = /^[^\w]+$/.test(currentStringNormalized)
const previousContainsOnlySpecialChars = /^[^\w]+$/.test(previousStringNormalized)
const currentIsValidWord = allWords.includes(currentStringNormalized)
const previousIsValidWord = allWords.includes(previousStringNormalized)
// Count of dictionary words containing the candidate as a substring; dropping
// from >=1 to 0 signals that the candidate has grown past any possible word.
const currentMatchCount = allWords.filter(word=>word.includes(currentStringNormalized)).length
const previousMatchCount = allWords.filter(word=>word.includes(previousStringNormalized)).length
if (!isFirstAndNotLastIteration && ((previousMatchCount >= 1 && currentMatchCount === 0) || isLastIteration || (previousContainsOnlySpecialChars && !currentContainsOnlySpecialChars))) {
// `everything` keeps non-word runs in the output; otherwise only valid words survive.
if (everything || currentIsValidWord || previousIsValidWord) {
if (currentIsValidWord || currentContainsOnlySpecialChars) {
// `list` mode wraps valid words in a one-element array.
words.push(list && currentIsValidWord ? [currentString] : currentString)
} else {
words.push(list && previousIsValidWord ? [previousString] : previousString)
if (isFirstAndNotLastIteration)
words.push(char)
}
}
// Restart accumulation from the character that ended the previous word.
currentString = char
// Split `text` into pinyin words by scanning from the end of the string and
// greedily matching the longest dictionary word that ends at the cursor.
//
// @param {string} text - input that may mix pinyin with other characters
// @param {boolean} everything - when true, non-pinyin characters are kept in
//   the output; when false they are silently dropped
// @param {boolean} returnAsList - when true, each pinyin word is wrapped in a
//   one-element array so callers can tell words apart from passthrough runs,
//   and consecutive non-pinyin characters are merged into a single string
// @returns {Array<string|string[]>} tokens in original text order
const split = (text, everything = false, returnAsList = false) => {
  const list = []
  let wordEnd = text.length
  while (wordEnd > 0) {
    // Try the longest candidate ending at `wordEnd` first, shrinking on miss.
    let count = wordEnd
    let wordFound = false
    while (count > 0) {
      const word = text.substring(wordEnd - count, wordEnd)
      if (allWords.includes(normalizePinyin(word))) {
        wordFound = true
        list.push(returnAsList ? [word] : word)
        // Consume the matched word; the trailing `wordEnd--` below takes the
        // remaining character, for a net move of `count`.
        wordEnd -= (count - 1)
        break
      }
      count--
    }
    if (!wordFound && everything) {
      const last = list[list.length - 1]
      if (wordEnd === text.length || typeof last === 'object' || !returnAsList) {
        // Start a new passthrough token. Outside list mode every character
        // stays its own token, matching the non-wrapped output shape.
        list.push(text[wordEnd - 1])
      } else {
        // List mode with a string run in progress: grow it. We scan
        // right-to-left, so prepend to keep characters in original order.
        // (The original commit had an unreachable `else` branch here — it
        // required `!returnAsList`, which the first condition already takes.)
        list[list.length - 1] = text[wordEnd - 1] + last
      }
    }
    wordEnd--
  }
  // Tokens were collected back-to-front; restore original order.
  return list.reverse()
}

module.exports = split
15 changes: 13 additions & 2 deletions test/index.js
Expand Up @@ -73,8 +73,8 @@ describe('Split spaced text and keep spaces', () => {

describe('Split spaced and punctuated text and keep spaces as well as punctuation', () => {
it('should split the text into the correct words and punctuation', done => {
const list = ['',' ','bú',' ','huì',' ','shuō',' ','Yīng','wén','.']
split(' bú huì shuō Yīngwén.', true).should.deepEqual(list)
const list = ['',' ','bú',' ','huì',' ','shuō',' ','Yīng','wén','.']
split(' bú huì shuō Yīngwén.', true).should.deepEqual(list)
done()
})
it('should split the text into the correct words and punctuation', done => {
Expand All @@ -83,3 +83,14 @@ describe('Split spaced and punctuated text and keep spaces as well as punctuatio
done()
})
})

// Regression test for the new optional list-wrapping mode (third `split`
// argument): pinyin words come back wrapped in one-element arrays while
// runs of other characters are merged into plain strings.
describe('Split text containing English words', () => {
it('should split text containing English and Pinyin correctly', done => {
const list = [
'This is ', [ 'ran' ], 'd', [ 'o' ], 'm ', [ 'te' ], 'xt: "',
[ 'wo' ], [ 'de' ], [ 'mao' ], [ 'xi' ], [ 'huan' ], [ 'he' ], [ 'niu' ], [ 'nai' ], '".'
]
split('This is random text: "wodemaoxihuanheniunai".', true, true).should.deepEqual(list)
done()
})
})

0 comments on commit 7223315

Please sign in to comment.