Spaces:

CatPtain
/

web_ppt

Paused

App Files Files Community

web_ppt / frontend /src /utils /htmlParser /lexer.ts

CatPtain

Upload 339 files

89ce340 verified 5 months ago

raw

history blame

6.29 kB

	import { startsWith, endsWith } from 'lodash'
	import type { Token } from './types'
	import { childlessTags } from './tags'

	/** Mutable cursor state threaded through every lexing routine in this module. */
	interface State {
	/** The input string being tokenized. */
	str: string
	/** Index into `str` of the next character to be read. */
	position: number
	/** Tokens emitted so far, in the order they were pushed. */
	tokens: Token[]
	}

	/**
	 * Moves the cursor of `state` to the absolute index `end`.
	 *
	 * The jump is converted to a relative offset and delegated to `movePositopn`,
	 * so all cursor writes go through a single helper.
	 */
	const jumpPosition = (state: State, end: number) => {
	  movePositopn(state, end - state.position)
	}

	/**
	 * Shifts the cursor of `state` by `len` characters relative to where it is now.
	 * No bounds checking is performed here.
	 */
	const movePositopn = (state: State, len: number) => {
	  state.position += len
	}

	/**
	 * Finds where a run of plain text ends, i.e. the index of the next `<` that
	 * looks like the start of markup.
	 *
	 * A `<` only counts when the character right after it is `/`, `!`, or an
	 * ASCII letter or digit; any other `<` (for example in `a < b`) is treated as
	 * text and skipped.
	 *
	 * @param str   The string to search.
	 * @param index The index at which to start searching.
	 * @returns The index of the qualifying `<`, or `-1` if there is none.
	 */
	const findTextEnd = (str: string, index: number) => {
	  let searchFrom = index
	  // Every iteration either returns or advances past the `<` it just rejected.
	  for (;;) {
	    const textEnd = str.indexOf('<', searchFrom)
	    if (textEnd === -1) return -1

	    const char = str.charAt(textEnd + 1)
	    if (char === '/' || char === '!' || /[A-Za-z0-9]/.test(char)) return textEnd

	    searchFrom = textEnd + 1
	  }
	}

	/**
	 * Consumes a run of plain text starting at the cursor and emits it as a
	 * `text` token.
	 *
	 * Does nothing when markup begins exactly at the cursor. When no further
	 * markup exists, the rest of the input is taken as text.
	 */
	const lexText = (state: State) => {
	  const textStart = state.position
	  const markupAt = findTextEnd(state.str, textStart)
	  // Markup starts right here: there is no text to emit.
	  if (markupAt === textStart) return

	  const textStop = markupAt === -1 ? state.str.length : markupAt
	  const content = state.str.slice(textStart, textStop)
	  jumpPosition(state, textStop)

	  state.tokens.push({ type: 'text', content })
	}

	/**
	 * Consumes a comment starting at the cursor and emits a `comment` token
	 * holding the text between the opener and the `-->` terminator.
	 *
	 * If the terminator is missing, everything up to the end of the input becomes
	 * the comment content.
	 */
	const lexComment = (state: State) => {
	  const { str } = state

	  // Step over the four-character opener before reading the content.
	  movePositopn(state, 4)
	  const contentStart = state.position

	  const terminatorAt = str.indexOf('-->', contentStart)
	  const terminated = terminatorAt !== -1
	  const contentStop = terminated ? terminatorAt : str.length
	  const commentStop = terminated ? terminatorAt + 3 : str.length

	  const content = str.slice(contentStart, contentStop)
	  jumpPosition(state, commentStop)

	  state.tokens.push({ type: 'comment', content })
	}

	/**
	 * Reads a tag name starting at the cursor, emits it as a `tag` token and
	 * leaves the cursor just past the name.
	 *
	 * Leading whitespace, `/` and `>` characters are skipped first; the name then
	 * extends up to (not including) the next whitespace, `/` or `>`.
	 *
	 * @returns The tag name exactly as written in the input (case preserved).
	 */
	const lexTagName = (state: State) => {
	  const { str } = state
	  const len = str.length
	  // A character belongs to a tag name unless it is whitespace, '/' or '>'.
	  const isTagChar = (char: string) => !(/\s/.test(char) || char === '/' || char === '>')

	  // Skip anything that cannot start a name.
	  let start = state.position
	  while (start < len && !isTagChar(str.charAt(start))) start++

	  // The character at `start` is already known to be part of the name.
	  let end = start + 1
	  while (end < len && isTagChar(str.charAt(end))) end++

	  jumpPosition(state, end)
	  const tagName = str.slice(start, end)
	  state.tokens.push({
	    type: 'tag',
	    content: tagName
	  })
	  return tagName
	}

	/**
	 * Reads the attribute section of a tag, starting at the cursor, and emits one
	 * `attribute` token per attribute.
	 *
	 * Phase 1 splits the input into whitespace-separated words, treating quoted
	 * sections as opaque, and stops at the first unquoted `/` or `>` (the cursor is
	 * left on that character, or at the end of the scan if none was found).
	 *
	 * Phase 2 re-joins words that belong to one attribute but were split by
	 * whitespace around `=`: `name =value`, `name = value` and `name= value` are
	 * each emitted as `name=value`. A trailing `=` with no usable value is dropped.
	 */
	const lexTagAttributes = (state: State) => {
	  const { str, tokens } = state
	  const len = str.length
	  const words: string[] = []
	  let cursor = state.position
	  // The quote character currently open, or null when outside quotes.
	  let quote: string | null = null
	  let wordBegin = cursor

	  while (cursor < len) {
	    const char = str.charAt(cursor)

	    // Inside quotes every character is part of the word until the matching quote.
	    if (quote) {
	      if (char === quote) quote = null
	      cursor++
	      continue
	    }

	    const isTagEnd = char === '/' || char === '>'
	    if (isTagEnd) {
	      if (cursor !== wordBegin) words.push(str.slice(wordBegin, cursor))
	      break
	    }

	    const isWordEnd = /\s/.test(char)
	    if (isWordEnd) {
	      if (cursor !== wordBegin) words.push(str.slice(wordBegin, cursor))
	      wordBegin = cursor + 1
	      cursor++
	      continue
	    }

	    const isQuoteStart = char === '\'' || char === '"'
	    if (isQuoteStart) quote = char
	    cursor++
	  }
	  jumpPosition(state, cursor)

	  const type = 'attribute'
	  for (let i = 0; i < words.length; i++) {
	    const word = words[i]

	    const isNotPair = !word.includes('=')
	    if (isNotPair) {
	      const secondWord = words[i + 1]
	      if (secondWord && startsWith(secondWord, '=')) {
	        // `name =value`: glue the two words together.
	        if (secondWord.length > 1) {
	          const newWord = word + secondWord
	          tokens.push({ type, content: newWord })
	          i += 1
	          continue
	        }
	        // `name = value`: the lone `=` is consumed either way.
	        const thirdWord = words[i + 2]
	        i += 1
	        if (thirdWord) {
	          const newWord = word + '=' + thirdWord
	          tokens.push({ type, content: newWord })
	          i += 1
	          continue
	        }
	        // No value follows the `=`: fall through and emit the bare name.
	      }
	    }
	    if (endsWith(word, '=')) {
	      // `name= value`: join with the next word unless that word is itself a pair.
	      const secondWord = words[i + 1]
	      if (secondWord && !secondWord.includes('=')) {
	        const newWord = word + secondWord
	        tokens.push({ type, content: newWord })
	        i += 1
	        continue
	      }

	      // Dangling `=`: emit the word without it.
	      const newWord = word.slice(0, -1)
	      tokens.push({ type, content: newWord })
	      continue
	    }

	    tokens.push({ type, content: word })
	  }
	}

	/**
	 * Consumes the raw content of a tag whose body must not be tokenized as
	 * markup, up to and including its matching closing tag.
	 *
	 * Each `</` candidate is lexed into a scratch state. The first one whose name
	 * matches `tagName` (case-insensitively) wins: the text before it is emitted as
	 * a single `text` token, followed by the closing tag's own tokens. If no `</`
	 * remains, the rest is handed to `lexText`.
	 *
	 * @param tagName Name of the tag that was just opened.
	 * @param state   Lexer state positioned right after the opening tag.
	 */
	const lexSkipTag = (tagName: string, state: State) => {
	  const { str, tokens } = state
	  const expectedName = tagName.toLowerCase()
	  const total = str.length
	  let searchFrom = state.position

	  while (searchFrom < total) {
	    const closeAt = str.indexOf('</', searchFrom)
	    if (closeAt === -1) {
	      lexText(state)
	      return
	    }

	    // Lex the candidate closing tag in isolation so a mismatch leaves `state` untouched.
	    const probe: State = { str, position: closeAt, tokens: [] }
	    const closingName = lexTag(probe)
	    if (closingName.toLowerCase() !== expectedName) {
	      searchFrom = probe.position
	      continue
	    }

	    // Everything between the cursor and the closing tag is raw text.
	    if (closeAt !== state.position) {
	      const rawStart = state.position
	      jumpPosition(state, closeAt)
	      tokens.push({
	        type: 'text',
	        content: str.slice(rawStart, closeAt),
	      })
	    }

	    tokens.push(...probe.tokens)
	    jumpPosition(state, probe.position)
	    return
	  }
	}

	/**
	 * Lexes one complete tag starting at the cursor and emits, in order: a
	 * `tag-start` token, the `tag` name token, any `attribute` tokens, and a
	 * `tag-end` token.
	 *
	 * `close` on `tag-start` is true when the character after the cursor is `/`;
	 * `close` on `tag-end` is true when the tag is terminated with a `/`.
	 *
	 * @returns The tag name as written in the input.
	 */
	const lexTag = (state: State) => {
	  const { str } = state

	  // Opening delimiter: one character, or two when a '/' follows it.
	  const startsClosed = str.charAt(state.position + 1) === '/'
	  movePositopn(state, startsClosed ? 2 : 1)
	  state.tokens.push({ type: 'tag-start', close: startsClosed })

	  const tagName = lexTagName(state)
	  lexTagAttributes(state)

	  // Closing delimiter: one character, or two when it begins with '/'.
	  const endsClosed = str.charAt(state.position) === '/'
	  movePositopn(state, endsClosed ? 2 : 1)
	  state.tokens.push({ type: 'tag-end', close: endsClosed })

	  return tagName
	}

	/**
	 * Main lexing loop: repeatedly consumes text, comments and tags until the
	 * cursor reaches the end of the input, appending tokens to `state.tokens`.
	 *
	 * After a tag whose lowercased name is listed in `childlessTags`, the tag's
	 * content is consumed by `lexSkipTag` instead of being tokenized normally.
	 */
	const lex = (state: State) => {
	  const { str } = state
	  const total = str.length

	  while (state.position < total) {
	    const before = state.position
	    lexText(state)
	    // Text was consumed; look for more from the new position.
	    if (state.position !== before) continue

	    // No text here, so the cursor sits on markup: a comment or a tag.
	    if (startsWith(str, '!--', before + 1)) {
	      lexComment(state)
	      continue
	    }

	    const tagName = lexTag(state)
	    if (childlessTags.includes(tagName.toLowerCase())) lexSkipTag(tagName, state)
	  }
	}

	/**
	 * Tokenizes an HTML string.
	 *
	 * @param str The markup to tokenize.
	 * @returns The tokens produced by the lexing loop, in source order.
	 */
	export const lexer = (str: string): Token[] => {
	  const tokens: Token[] = []
	  lex({ str, position: 0, tokens })
	  return tokens
	}