Spaces:

rrg92
/

hf-community-highlights-parser

Running

App Files Files Community

hf-community-highlights-parser / index.html

rrg92

Add some description

2ede869 verified 6 months ago

raw

history blame contribute delete

5.21 kB

	<html>
	<head>

	<script type="text/javascript">
	// Shared state for the most recent paste. ProcessPasted assigns it an object
	// with `html` and `text`; FormatDiscordMessage attaches `dom` and
	// ProcessPastedMessage attaches `result`. It stays undefined until the first paste.
	let CURRENT_CONTENT;
	// Single DOMParser instance reused by FormatDiscordMessage for every pasted message.
	let parser = new DOMParser()

	/**
	 * Parses the HTML of a copied message into a list of highlight entries.
	 *
	 * Only the direct element children of <body> are inspected. Their text is
	 * accumulated until an element whose text contains a line break is seen;
	 * at that point the accumulated text, links and authors are emitted as one
	 * entry, provided at least one link was collected.
	 *
	 * Side effect: when CURRENT_CONTENT is set, the parsed document is stored
	 * on it as `dom`.
	 *
	 * @param {string} html - HTML markup to parse.
	 * @returns {{dom: Document, content: Array<{text: string, links: string, authors: string, edition: (number|undefined)}>}}
	 *   The parsed document and the entries found. `links` and `authors` are
	 *   comma-joined strings; `edition` is the number read from the most recent
	 *   heading containing "#<digits>", or undefined if none was seen yet.
	 */
	function FormatDiscordMessage(html){
		const dom = parser.parseFromString(html, "text/html");

		// Guarded so the parser can also be called before any paste populated the shared state.
		if(CURRENT_CONTENT)
			CURRENT_CONTENT.dom = dom;

		const topLevelElements = dom.querySelectorAll("body > *");
		const FullContent = [];

		// Buffers for the entry currently being assembled.
		let buffContent = [];
		let buffLinks = [];
		let authorName = [];
		let edition;

		// Emits the buffered entry and resets the buffers. Without any link the
		// buffers are intentionally left untouched, so link-less lines are
		// carried over into the next entry that does have a link.
		const flushContent = function(){
			const links = buffLinks.join(",");

			if(!links)
				return;

			FullContent.push({
				text: buffContent.join("").trim()
				,links
				,authors: authorName.join(",")
				,edition
			});

			buffContent = [];
			buffLinks = [];
			authorName = [];
		};

		for(const c of topLevelElements){
			if(c.tagName === "A")
				buffLinks.push(c.href);

			// Headings (H1-H6) never contribute text; they only carry the edition number.
			if(/^H[1-6]$/.test(c.tagName)){
				const editionMatch = c.textContent.match(/#(\d+)/);

				if(editionMatch)
					edition = Number.parseInt(editionMatch[1], 10);

				continue;
			}

			if(c.classList.contains("mention"))
				authorName.push(c.textContent);

			const text = c.textContent;

			if(text)
				buffContent.push(text);

			// The optional prefix makes this pattern match any text containing a newline.
			if(/\!?\s*\n+/.test(text)){
				console.log("line break found!");
				flushContent();
			}
		}

		// Emit whatever is still buffered after the last element
		// (flushContent does nothing when no link was collected).
		flushContent();

		return { dom, content: FullContent };
	}

	/**
	 * Parses the HTML held in CURRENT_CONTENT and renders the result.
	 *
	 * Builds an XML document with a <highlights> root and one <highlight>
	 * child (containing <text>, <links>, <edition>, <author>) per parsed
	 * entry, writes the serialized XML into the #result textarea and a short
	 * summary into #stats. The parse result is stored on
	 * CURRENT_CONTENT.result.
	 *
	 * When there is no HTML to parse, the output is cleared and a message is
	 * shown in #stats instead.
	 */
	function ProcessPastedMessage(){
		const statsEl = document.querySelector("#stats");
		const out = document.querySelector("#result");

		// Nothing to parse: parsing a null html would be treated as the text "null".
		if(!CURRENT_CONTENT?.html){
			statsEl.textContent = "Stats: no HTML content found in the clipboard";
			out.value = "";
			return;
		}

		const res = FormatDiscordMessage(CURRENT_CONTENT.html);
		CURRENT_CONTENT.result = res;

		const xDoc = document.implementation.createDocument(null, "highlights");
		const rootDoc = xDoc.documentElement;

		const Stats = {
			total: 0
			// The list may be empty when no entry with a link was found.
			,edition: res.content[0]?.edition ?? null
		};

		for(const high of res.content){
			const xHigh = xDoc.createElement("highlight");

			const xText = xDoc.createElement("text");
			const xLinks = xDoc.createElement("links");
			const xEdition = xDoc.createElement("edition");
			const xAutor = xDoc.createElement("author");

			xText.textContent = high.text;
			xLinks.textContent = high.links;
			xEdition.textContent = high.edition ?? "";
			xAutor.textContent = high.authors;

			// Child order is part of the output format: text, links, edition, author.
			xHigh.append(xText, xLinks, xEdition, xAutor);
			rootDoc.appendChild(xHigh);

			Stats.total++;
		}

		statsEl.textContent = `Stats: total = ${Stats.total}, edition = ${Stats.edition}`;

		// Assign through .value: setting innerHTML on a textarea decodes character
		// references, which would turn the serializer's &lt; / &amp; escapes back
		// into raw characters and show invalid XML.
		out.value = new XMLSerializer().serializeToString(xDoc);
	}


	/**
	 * Paste-event handler: reads the first clipboard item through the async
	 * Clipboard API, stores its HTML and plain-text representations in
	 * CURRENT_CONTENT and schedules ProcessPastedMessage.
	 *
	 * @param {Event} content - The paste event passed by the listener. It is
	 *   not used; the data is read from navigator.clipboard instead.
	 * @returns {boolean} Always false.
	 */
	function ProcessPasted(content){
		const readClipboard = async () => {
			const [item] = await navigator.clipboard.read();

			if(!item){
				console.log("EmptyClipboard");
				return;
			}

			const contentTypes = item.types;
			console.log("content", contentTypes);

			// getType() rejects for a type the item does not offer, so check first.
			let plainText = null;
			if(contentTypes.includes("text/plain"))
				plainText = await (await item.getType("text/plain")).text();

			let html = null;
			if(contentTypes.includes("text/html")){
				const htmlContent = await item.getType("text/html");
				console.log("html:", htmlContent);

				html = await htmlContent.text();
			} else {
				console.log("NotContainsHtml");
			}

			CURRENT_CONTENT = {
				html
				,text: plainText
			};

			setTimeout(ProcessPastedMessage, 100);
		};

		// Reading can be rejected (e.g. clipboard permission denied); report it
		// instead of leaving an unhandled rejection.
		readClipboard().catch((err) => {
			console.error("Could not read the clipboard:", err);
		});

		return false;
	}

	// Register ProcessPasted as the handler for paste events on the global object.
	addEventListener("paste", ProcessPasted);


	</script>
	<style>
	textarea {
	width: 100%;
	}

	.container {
	display: flex;
	flex-direction: row;
	}

	.container > div {
	width: 50%;
	height: 70vh;
	padding: 5px;
	}

	.container textarea {
	height: 100%;
	}
	</style>

	</head>
	<body>
	<p>This is a simple parser for Community Highlights, posted weekly on the Hugging Face Discord.</p>
	<p>Community Highlights are valuable information. While Hugging Face doesn't provide an official list (via some API), use this tool to parse them and import them anywhere.</p>
	<p>Use it to transform the content into something easier to parse (for example, to import into a database or a blog).</p>
	<p>Instructions:</p>
	<ol>
	<li>Open Discord in a browser (just opening it in the browser works)</li>
	<li>Go to the desired Community Highlights message. Select the whole message and copy it</li>
	<li>Paste it into the Content field</li>
	<li>The parsed data will then be generated in the field beside it, in XML format. That format makes it easier to import anywhere</li>
	</ol>
	<p>TODO: JSON Support, API import</p>
	<div>
	<p id="stats"></p>
	</div>
	<div class="container">
	<div>
	<p>Content</p>
	<textarea></textarea>
	</div>

	<div>
	<p>XML</p>
	<textarea readonly id="result"></textarea>
	</div>
	</div>




	</body>
	</html>