Daily-Papers-Atlas / data /graphml_to_json.py
hesamation's picture
moved data files to new folder, update README
1e6e2b3
raw
history blame contribute delete
4.61 kB
#!/usr/bin/env python
import json
import gzip
import xml.etree.ElementTree as ET
import sys
import os
def _find_data_by_key(parent, key, ns):
    """Return the first <data key=...> child of *parent*, or None.

    The namespaced tag is tried first, then the bare tag. The result is
    compared against None explicitly: a childless Element is falsy, so an
    ``a or b`` fallback would discard a leaf element that was actually found.
    """
    found = parent.find(f'graphml:data[@key="{key}"]', ns)
    if found is None:
        found = parent.find(f'data[@key="{key}"]')
    return found


def _children(parent, tag, ns):
    """Return the direct *tag* children of *parent*, namespaced or bare."""
    # findall returns a list, so falling back on an empty result is safe here.
    return parent.findall(f'graphml:{tag}', ns) or parent.findall(tag)


def _node_to_sigma(node, ns):
    """Build the Sigma node dict for one GraphML <node> element."""
    node_data = {'id': node.get('id'), 'attr': {'colors': {}}}
    # Keys are handled in document order, so when both an "r" and a "type"
    # key are present the one that appears later decides the final colour.
    for data in _children(node, 'data', ns):
        key = data.get('key')
        if key == 'label':
            node_data['label'] = data.text
        elif key in ('x', 'y', 'size'):
            node_data[key] = float(data.text)
        elif key == 'r':
            # The colour is only emitted when all three channels are present.
            g_elem = _find_data_by_key(node, 'g', ns)
            b_elem = _find_data_by_key(node, 'b', ns)
            if g_elem is not None and b_elem is not None:
                node_data['color'] = f"rgb({data.text},{g_elem.text},{b_elem.text})"
        elif key == 'type':
            node_data['attr']['colors']['type'] = data.text
            # Set a default color based on node type
            if data.text == 'author':
                node_data['color'] = 'rgb(154,150,229)'
            elif data.text == 'paper':
                node_data['color'] = 'rgb(229,150,154)'
            else:
                node_data['color'] = 'rgb(150,229,154)'
    return node_data


def _edge_to_sigma(edge, index, ns):
    """Build the Sigma edge dict for one GraphML <edge> element."""
    edge_data = {
        'id': f"e{index}",
        'source': edge.get('source'),
        'target': edge.get('target'),
    }
    for data in _children(edge, 'data', ns):
        key = data.get('key')
        if key == 'weight':
            edge_data['weight'] = float(data.text)
        elif key == 'edgelabel':
            edge_data['label'] = data.text
    return edge_data


def graphml_to_json(graphml_file, output_json, compressed_output=None):
    """
    Convert a GraphML file to SigmaJS-compatible JSON format

    The first <graph> element (namespaced or not) is read and written out as
    ``{"nodes": [...], "edges": [...]}``. Each node carries its ``id`` and an
    ``attr`` dict, plus ``label``, ``x``, ``y``, ``size`` and ``color`` when
    the matching <data> keys are present. Each edge carries a generated id
    (``e0``, ``e1``, ...), ``source`` and ``target``, plus ``weight`` and
    ``label`` when present. Progress messages are printed to stdout.

    Args:
        graphml_file: Path of the GraphML document to parse.
        output_json: Path the JSON document is written to.
        compressed_output: Optional path; when truthy, a gzip-compressed copy
            of the written JSON file is created there.

    Raises:
        ValueError: If the document contains no <graph> element.
    """
    # Parse the GraphML file
    print(f"Parsing GraphML file: {graphml_file}")
    root = ET.parse(graphml_file).getroot()

    # Define the namespace
    ns = {'graphml': 'http://graphml.graphdrawing.org/xmlns'}

    # Extract the graph from the GraphML, retrying without the namespace
    graph = root.find('graphml:graph', ns)
    if graph is None:
        graph = root.find('graph')
    if graph is None:
        raise ValueError("Could not find graph element in GraphML file")

    print("Processing nodes...")
    nodes = [_node_to_sigma(node, ns) for node in _children(graph, 'node', ns)]
    print(f"Processed {len(nodes)} nodes")

    print("Processing edges...")
    edges = [
        _edge_to_sigma(edge, index, ns)
        for index, edge in enumerate(_children(graph, 'edge', ns))
    ]
    print(f"Processed {len(edges)} edges")

    sigma_data = {'nodes': nodes, 'edges': edges}

    # Write the JSON file
    print(f"Writing JSON to {output_json}")
    with open(output_json, 'w') as f:
        json.dump(sigma_data, f)

    # If compressed output is requested, create a gzipped version
    if compressed_output:
        print(f"Creating compressed file: {compressed_output}")
        # Compress the bytes actually on disk so both files hold the same JSON.
        with open(output_json, 'rb') as f_in:
            data = f_in.read()
        with gzip.open(compressed_output, 'wb', compresslevel=9) as f_out:
            f_out.write(data)
if __name__ == '__main__':
    # Command-line entry point: <input_graphml> <output_json> [compressed_output].
    cli_args = sys.argv[1:]

    # Both the input and the output path are mandatory.
    if len(cli_args) < 2:
        print("Usage: python graphml_to_json.py <input_graphml> <output_json> [compressed_output]")
        sys.exit(1)

    input_file, output_file = cli_args[0], cli_args[1]
    # The gzip target is optional; any further arguments are ignored.
    compressed_file = None
    if len(cli_args) > 2:
        compressed_file = cli_args[2]

    try:
        graphml_to_json(input_file, output_file, compressed_file)
        print(f"Conversion completed. JSON saved to {output_file}")
        if compressed_file:
            print(f"Compressed version saved to {compressed_file}")
    except Exception as exc:
        # Top-level boundary: report the failure and exit with a non-zero status.
        print(f"Error during conversion: {exc}")
        sys.exit(1)