Spaces:

rrg92
/

hf-community-highlights-parser

Running

File size: 5,212 Bytes

<html>
	<head>
	
		<script type="text/javascript">
			// State of the most recent paste. ProcessPasted assigns {html, text};
			// FormatDiscordMessage adds .dom and ProcessPastedMessage adds .result.
			let CURRENT_CONTENT;
			// Shared DOMParser instance used by FormatDiscordMessage to parse the pasted HTML.
			let parser = new DOMParser()
		
			/**
			 * Parses the HTML of a copied Discord "Community Highlights" message into
			 * a list of highlight records.
			 *
			 * Only the direct children of <body> are walked. Text is accumulated until
			 * a child whose text contains a line break is seen; at that point the
			 * accumulated text, link URLs and mentioned authors are emitted as one
			 * record. A buffer that has no link yet is neither emitted nor cleared, so
			 * link-less lines are carried over into the next record.
			 *
			 * Side effect: stores the parsed document on CURRENT_CONTENT.dom when
			 * CURRENT_CONTENT is set.
			 *
			 * @param {string} html HTML markup taken from the clipboard.
			 * @returns {{dom: Document, content: Array<{text: string, links: string, authors: string, edition: (number|undefined)}>}}
			 *          The parsed document and the extracted highlight records
			 *          (links and authors are comma-joined strings).
			 */
			function FormatDiscordMessage(html){
				// null/undefined would otherwise be stringified and parsed as the text "null".
				let dom = parser.parseFromString(String(html ?? ""), "text/html");

				// Keep the parsed document around for debugging, but do not crash
				// when the function is called before anything was pasted.
				if(CURRENT_CONTENT)
					CURRENT_CONTENT.dom = dom;

				let allChilds = dom.querySelectorAll("body > *");

				let FullContent = [];

				let buffContent = [];
				let buffLinks = [];
				let authorName = [];
				let edition;

				// Emits the buffered pieces as one record and resets the buffers.
				// Without at least one link the buffers are kept untouched.
				let flushContent = function(){
					let links = buffLinks.join(",");

					if(!links)
						return;

					FullContent.push({
						 text: buffContent.join("").trim()
						,links
						,authors: authorName.join(",")
						,edition
					});

					buffContent = [];
					buffLinks = [];
					authorName = [];
				}

				for(let c of allChilds){

					if(c.tagName == "A")
						buffLinks.push(c.href);

					// Real headings only (h1-h6); a first-letter test would also
					// swallow other tags starting with "H".
					if(/^H[1-6]$/.test(c.tagName)){

						// The edition number is written as "#<digits>" in the heading.
						let editionMatch = c.textContent.match(/#(\d+)/);

						if(editionMatch){
							edition = parseInt(editionMatch[1], 10);
						}

						continue;
					}

					if(c.classList.contains("mention"))
						authorName.push(c.textContent);

					let text = c.textContent;

					if(text)
						buffContent.push(text);

					// A line break in the text marks the end of a highlight.
					if(text.includes("\n")){
						console.log("line break found!");
						flushContent();
					}
				}

				// Emit whatever is left after the last child; flushContent itself
				// ignores a buffer that has no links.
				flushContent();

				return { dom, content: FullContent };
			}
			
			/**
			 * Converts the last pasted message (CURRENT_CONTENT.html) into an XML
			 * document with one <highlight> element per parsed record, writes the
			 * serialized XML into the #result textarea and a summary into #stats.
			 *
			 * The parse result is also stored on CURRENT_CONTENT.result.
			 * Does nothing when no paste has been captured yet.
			 */
			function ProcessPastedMessage(){
				// Nothing has been pasted yet.
				if(!CURRENT_CONTENT)
					return;

				let res = FormatDiscordMessage(CURRENT_CONTENT.html)

				CURRENT_CONTENT.result = res;

				let out = document.querySelector("#result");

				let xDoc = document.implementation.createDocument(null, "highlights");
				let rootDoc = xDoc.documentElement;

				let Stats = {
					 total: res.content.length
					// The first record's edition; null when nothing was parsed
					// (indexing an empty list used to throw here).
					,edition: res.content[0]?.edition ?? null
				};

				for(let high of res.content){
					let xHigh = xDoc.createElement("highlight");

					let xAutor 		= xDoc.createElement("author");
					let xLinks 		= xDoc.createElement("links");
					let xEdition 	= xDoc.createElement("edition");
					let xText 		= xDoc.createElement("text");

					xAutor.textContent 		= high.authors
					xLinks.textContent 		= high.links
					xEdition.textContent 	= high.edition
					xText.textContent 		= high.text

					// Child order in the output: text, links, edition, author.
					xHigh.appendChild(xText);
					xHigh.appendChild(xLinks);
					xHigh.appendChild(xEdition);
					xHigh.appendChild(xAutor);

					rootDoc.appendChild(xHigh)
				}

				// Plain text, so textContent is enough and nothing is parsed as markup.
				document.querySelector("#stats").textContent = `Stats: total = ${Stats.total}, edition = ${Stats.edition}`

				let serializer = new XMLSerializer();
				// Use .value: assigning innerHTML on a textarea decodes character
				// references (e.g. "&amp;" becomes "&"), which would display invalid
				// XML, and it no longer updates the field once the user has edited it.
				out.value = serializer.serializeToString(xDoc);
			}
			
		
			/**
			 * "paste" event handler. Reads the first clipboard item through the async
			 * Clipboard API, stores its HTML and plain-text flavours in
			 * CURRENT_CONTENT and schedules ProcessPastedMessage.
			 *
			 * When the clipboard holds no "text/html" flavour, CURRENT_CONTENT.html is
			 * null and no processing is scheduled.
			 *
			 * @param {ClipboardEvent} content The paste event (not used; the data is
			 *        read from navigator.clipboard instead).
			 * @returns {boolean} Always false.
			 */
			function ProcessPasted(content){

				// The async Clipboard API only exists in secure contexts and
				// reasonably recent browsers.
				if(!navigator.clipboard || !navigator.clipboard.read){
					console.error("Clipboard API (navigator.clipboard.read) is not available");
					return false;
				}

				navigator.clipboard.read()
					.then( async (items) => {

						let item = items[0];

						if(!item){
							console.log("EmptyClipboard");
							return;
						}

						let contentTypes = item.types;
						console.log("content", contentTypes)

						// getType() rejects for a flavour the item does not carry,
						// so check the type list first.
						let plainText = "";
						if(contentTypes.includes("text/plain"))
							plainText = await (await item.getType("text/plain")).text();

						let html = null;

						if(contentTypes.includes("text/html")){
							let htmlContent = await item.getType("text/html");
							console.log("html:", htmlContent);

							html = await htmlContent.text();
						} else {
							console.log("NotContainsHtml");
						}

						CURRENT_CONTENT = {
							html
							,text: plainText
						}

						// Without HTML there is nothing to parse.
						if(html === null)
							return;

						setTimeout(ProcessPastedMessage, 100)

					})
					.catch( (err) => {
						// E.g. clipboard permission denied or the document is not focused.
						console.error("Failed to read clipboard:", err);
					})

				return false;
			}
			
			// Register ProcessPasted as the handler for "paste" events on the global object.
			addEventListener("paste", ProcessPasted);
			

		</script>
		<style>
			/* Every textarea fills the width of its parent. */
			textarea {
				width: 100%;
			}
			
			/* Lays out its child panels side by side in a row. */
			.container {
				display: flex;
				flex-direction: row;
			}
			
			/* Each panel takes half the width and 70% of the viewport height. */
			.container > div {
				width: 50%;
				height: 70vh;
				padding: 5px;
			}
			
			/* Textareas inside a panel stretch to the panel's height. */
			.container textarea {
				height: 100%;
			}
		</style>
	
	</head>
	<body>
        <p>This is a simple parser for Community Highlights, posted weekly on the Hugging Face Discord.</p>
        <p>Community Highlights are valuable information. While Hugging Face doesn't provide an official list (via some API), use this tool to parse them and import them anywhere.</p>
        <p>Use it to transform the content into a format that is easier to parse (for example, to import into a database or a blog).</p>
        <p>Instructions:</p>
        <ol>
          <li>Open Discord in a browser (just opening it in the browser works).</li>
          <li>Go to the desired Community Highlights message, select the whole message and copy it.</li>
          <li>Paste it into the Content field.</li>
          <li>The parsed data is then generated in the field beside it, in XML format. That format is easier for you to import anywhere.</li>
        </ol>
        <p>TODO: JSON Support, API import</p>
		<div>
			<p id="stats"></p>
		</div>
		<div class="container">
			<div>
				<p>Content</p>
				<textarea></textarea>
			</div>
			
			<div>
				<p>XML</p>
				<textarea readonly id="result"></textarea>
			</div>
		</div>

		
		
	
	</body>
</html>