"""Generate a static HTML page from a Mastodon account export.

The script reads ``archive.zip`` from the current directory, extracts the
avatar and the media attachments next to it, and writes ``index.html``.
"""

import html
import json
import os
import re
import shutil
import zipfile
from datetime import datetime
from urllib.parse import urlparse

# Danish month names, indexed by ``month - 1``.
_MONTHS_DA = (
    'januar', 'februar', 'marts', 'april', 'maj', 'juni',
    'juli', 'august', 'september', 'oktober', 'november', 'december',
)

# Extensions kept as-is for the extracted avatar; anything else becomes "png".
_AVATAR_EXTENSIONS = ('png', 'jpg', 'jpeg', 'gif')

_HTML_TAG_RE = re.compile(r'<[^>]+>')
# NOTE(review): in the reviewed copy this pattern had been reduced to
# ``]*>``; it is restored here as "an opening <a ...> tag" -- TODO confirm
# against the original pattern.
_ANCHOR_TAG_RE = re.compile(r'<a\s[^>]*>')
_CLASS_ATTR_RE = re.compile(r'class="([^"]*)"')


def get_danish_date(timestamp):
    """Return the local-time date of POSIX *timestamp* as e.g. ``5. marts 2024``."""
    return format_danish_date(datetime.fromtimestamp(timestamp))


def format_danish_date(dt):
    """Format *dt* (anything with ``day``/``month``/``year``) as a Danish long date."""
    return f"{dt.day}. {_MONTHS_DA[dt.month - 1]} {dt.year}"


def get_safe_id(mastodon_id_url):
    """Return the last path segment of *mastodon_id_url*.

    Falls back to ``"unknown"`` when the URL is empty/``None`` or consists
    only of slashes, so the result is always usable inside an ``id``.
    """
    if not mastodon_id_url:
        return "unknown"
    return mastodon_id_url.rstrip('/').split('/')[-1] or "unknown"


def _parse_iso_timestamp(value):
    """Parse an ISO 8601 string; return ``None`` when it is empty or malformed."""
    if not isinstance(value, str) or not value:
        return None
    try:
        # Older ``fromisoformat`` versions reject a trailing "Z".
        return datetime.fromisoformat(value.replace('Z', '+00:00'))
    except ValueError:
        return None


def _load_json(archive, member):
    """Decode the JSON document stored as *member* inside *archive*."""
    with archive.open(member) as f:
        return json.load(f)


def _extract_avatar(archive, actor):
    """Copy the actor's icon out of *archive*; return its local path or ``""``.

    The path is returned only when the file was actually written, so the
    page never points at an image that does not exist.
    """
    icon = actor.get('icon')
    if not isinstance(icon, dict):
        return ""
    member = icon.get('url')
    if not isinstance(member, str) or not member:
        return ""

    ext = member.split('.')[-1].lower()
    if ext not in _AVATAR_EXTENSIONS:
        ext = 'png'
    target = f"avatar.{ext}"
    try:
        # ``archive.open`` raises KeyError before the target file is created.
        with archive.open(member) as f_in, open(target, 'wb') as f_out:
            shutil.copyfileobj(f_in, f_out)
    except KeyError:
        return ""
    return target


def _collect_hashtags(obj):
    """Return the lower-cased names of all ``Hashtag`` entries in ``obj['tag']``."""
    names = []
    for entry in obj.get('tag') or []:
        if isinstance(entry, dict) and entry.get('type') == 'Hashtag':
            name = (entry.get('name') or '').lower()
            if name:
                names.append(name)
    return names


def _has_external_link(raw_content):
    """Tell whether *raw_content* holds an anchor that is not a mention/hashtag."""
    for tag in _ANCHOR_TAG_RE.findall(raw_content):
        class_match = _CLASS_ATTR_RE.search(tag)
        if class_match is None:
            # An anchor without a class attribute counts as a plain link.
            return True
        classes = class_match.group(1).split()
        if 'mention' not in classes and 'hashtag' not in classes:
            return True
    return False


def _extract_media(archive, attachments, zip_file_map, media_dir):
    """Extract a post's attachments into *media_dir* and return their markup.

    Attachments are matched to archive members by file name. Only images and
    videos produce markup; an empty string is returned when nothing did.
    """
    parts = []
    for att in attachments:
        if not isinstance(att, dict):
            continue
        att_url = att.get('url') or ''
        if not att_url:
            continue
        filename = os.path.basename(urlparse(att_url).path)
        zip_path = zip_file_map.get(filename)
        if zip_path is None:
            continue

        local_media_path = os.path.join(media_dir, filename)
        try:
            with archive.open(zip_path) as f_in, open(local_media_path, 'wb') as f_out:
                shutil.copyfileobj(f_in, f_out)
        except Exception as e:
            # Deliberate best effort: one unreadable attachment must not
            # abort the whole export.
            print(f"Kunne ikke udpakke medie {filename}: {e}")
            continue

        # Always use "/" in the page, whatever the OS path separator is.
        src = html.escape(f"{media_dir}/{filename}")
        media_type = att.get('mediaType') or ''
        if media_type.startswith('image/'):
            parts.append(f'<img src="{src}" alt="Vedhæftet billede">')
        elif media_type.startswith('video/'):
            parts.append(f'<video src="{src}" controls></video>')

    if not parts:
        return ""
    joined = "".join(parts)
    return f'<div class="media">\n{joined}\n</div>'


def _collect_posts(archive, outbox, zip_file_map, media_dir, profile_url):
    """Turn the outbox ``Create`` activities into render-ready post dicts.

    Returns ``(posts, stats)``: *posts* is sorted newest first by the raw
    ``published`` string, and *stats* holds the counters per post type
    (``types``), per year (``years``), per hashtag (``tags``) and per
    content kind (``content``).
    """
    posts = []
    stats = {
        'types': {'post': 0, 'svar': 0, 'citat': 0},
        'years': {},
        'tags': {},
        'content': {'media': 0, 'links': 0, 'tags': 0, 'no_tags': 0},
    }

    for item in outbox.get('orderedItems') or []:
        if not isinstance(item, dict) or item.get('type') != 'Create':
            continue
        obj = item.get('object')
        if not isinstance(obj, dict):
            continue

        raw_content = obj.get('content') or ''
        clean_text = _HTML_TAG_RE.sub('', raw_content).strip()

        # A post whose visible text starts with "RE:" is treated as a quote.
        if clean_text.startswith('RE:'):
            type_label, type_class = "Citat", "citat"
        elif obj.get('inReplyTo') is not None:
            type_label, type_class = "Svar", "svar"
        else:
            type_label, type_class = "Post", "post"
        stats['types'][type_class] += 1

        # Parse the date; fall back to the raw string when it is unparsable.
        published_str = str(obj.get('published') or '')
        post_year = "Ukendt"
        formatted_date = published_str
        dt = _parse_iso_timestamp(published_str)
        if dt is not None:
            # NOTE: ``%b`` is locale-dependent (English unless a locale is set).
            formatted_date = dt.strftime('%d. %b %Y kl. %H:%M')
            post_year = str(dt.year)
            stats['years'][post_year] = stats['years'].get(post_year, 0) + 1

        post_tags = _collect_hashtags(obj)
        for tag_name in post_tags:
            stats['tags'][tag_name] = stats['tags'].get(tag_name, 0) + 1

        has_links = _has_external_link(raw_content)
        media_html = _extract_media(
            archive, obj.get('attachment') or [], zip_file_map, media_dir
        )

        # Update the content counters.
        if media_html:
            stats['content']['media'] += 1
        if has_links:
            stats['content']['links'] += 1
        if post_tags:
            stats['content']['tags'] += 1
        else:
            stats['content']['no_tags'] += 1

        posts.append({
            'safe_id': get_safe_id(obj.get('id', '')),
            'content': raw_content,
            'media_html': media_html,
            'date': formatted_date,
            'url': str(obj.get('url') or profile_url),
            'raw_date': published_str,
            'year': post_year,
            'tags_attr': ",".join(post_tags),
            'type_label': type_label,
            'type_class': type_class,
            # String booleans: these end up in data attributes for filtering.
            'has_media': "true" if media_html else "false",
            'has_links': "true" if has_links else "false",
            'has_tags': "true" if post_tags else "false",
        })

    posts.sort(key=lambda post: post['raw_date'], reverse=True)
    return posts, stats


def _render_year_buttons(year_counts, total_posts):
    """Build the year filter buttons, newest year first."""
    buttons = [
        f'<button class="filter-btn active" data-year="all">Alle ({total_posts})</button>\n'
    ]
    for year in sorted(year_counts, reverse=True):
        count = year_counts[year]
        buttons.append(
            f'<button class="filter-btn" data-year="{year}">{year} ({count})</button>\n'
        )
    return "".join(buttons)


def _render_tag_buttons(tag_counts, total_posts):
    """Build the tag filter buttons, most used first; ``""`` when there are no tags."""
    sorted_tags = sorted(tag_counts.items(), key=lambda item: (-item[1], item[0]))
    if not sorted_tags:
        return ""
    parts = [
        '<div class="tag-filters">',
        '<span class="filter-label">Tags:</span>',
        '<div class="tag-list">',
        f'<button class="filter-btn active" data-tag="all">Alle ({total_posts})</button>\n',
    ]
    for tag, count in sorted_tags:
        display_tag = tag if tag.startswith('#') else f"#{tag}"
        parts.append(
            f'<button class="filter-btn" data-tag="{html.escape(tag)}">'
            f'{html.escape(display_tag)} ({count})</button>\n'
        )
    parts.append('</div></div>')
    return "".join(parts)


def _render_post(post, avatar_src):
    """Render one post dict as an ``<article>`` carrying its filter attributes."""
    esc = html.escape
    avatar_html = ""
    if avatar_src:
        avatar_html = (
            f'<img class="mini-avatar" src="{esc(avatar_src)}" alt="Mini profilbillede">'
        )
    attrs = (
        f'class="post {post["type_class"]}" id="post-{esc(post["safe_id"])}" '
        f'data-year="{esc(post["year"])}" data-tags="{esc(post["tags_attr"])}" '
        f'data-type="{post["type_class"]}" data-has-media="{post["has_media"]}" '
        f'data-has-links="{post["has_links"]}" data-has-tags="{post["has_tags"]}"'
    )
    # Post content is HTML produced by the server and is inserted verbatim.
    return f"""
<article {attrs}>
<header>{avatar_html}<span class="post-type">{post['type_label']}</span> <a class="post-date" href="{esc(post['url'])}">{esc(post['date'])}</a></header>
<div class="post-content">{post['content']}</div>
{post['media_html']}
</article>
"""


def _render_page(profile, posts, stats, archive_date):
    """Assemble the complete HTML document.

    NOTE(review): in the reviewed copy of this file the markup inside the
    HTML string literals had been stripped (only the text nodes were left).
    The element, class and attribute names used here are a reconstruction
    that keeps every visible text and every computed value. The original
    stylesheet and filter script were not visible and have to be merged
    back in from the original template -- TODO confirm.
    """
    esc = html.escape
    total_posts = len(posts)
    name = esc(profile['display_name'])
    handle = esc(profile['full_handle'])
    avatar_src = profile['avatar_src']
    created = profile['created']
    counts = stats['types']
    content = stats['content']

    avatar_html = ""
    if avatar_src:
        avatar_html = f'<img class="avatar" src="{esc(avatar_src)}" alt="Profilbillede">'
    created_html = f'<p class="created">{esc(created)}</p>' if created else ''
    years_html = _render_year_buttons(stats['years'], total_posts)
    tags_html = _render_tag_buttons(stats['tags'], total_posts)

    head = f"""<!DOCTYPE html>
<html lang="da">
<head>
<meta charset="utf-8">
<title>Mastodon Arkiv - {name}</title>
</head>
<body>
<h1>Mastodon Arkiv</h1>
<section class="profile">
{avatar_html}
<h2>{name}</h2>
<p class="handle">{handle}</p>
{created_html}
<div class="bio">{profile['bio']}</div>
<p class="stats"><span>{counts['post']} Posts</span> <span>{counts['svar']} Svar</span> <span>{counts['citat']} Citater</span></p>
</section>
<section class="moved">
<h2>Kontoen er flyttet!</h2>
<p>Min nye Mastodon instans og profil kan findes her: @aphandersen@andersen.one</p>
</section>
<p class="generated">Dette historiske Mastodon-arkiv blev genereret den {archive_date}.</p>
<nav class="filters">
<div class="year-filters">
{years_html}</div>
{tags_html}
</nav>
<main id="posts" data-total="{total_posts}" data-media-count="{content['media']}" data-links-count="{content['links']}" data-tags-count="{content['tags']}" data-no-tags-count="{content['no_tags']}">
"""
    tail = """
</main>
</body>
</html>
"""
    # Join once instead of growing the document with ``+=`` per post.
    return "".join([head, *(_render_post(post, avatar_src) for post in posts), tail])


def generate_html_archive():
    """Build ``index.html`` (plus avatar and ``media/``) from ``archive.zip``.

    Everything is read from and written to the current working directory.
    Problems are reported on stdout instead of being raised: a missing
    archive prints an error and returns, and any failure while processing
    the archive is caught and printed.
    """
    zip_filename = "archive.zip"
    html_filename = "index.html"
    media_dir = "media"

    if not os.path.exists(zip_filename):
        print(f"Fejl: Kunne ikke finde '{zip_filename}' i den nuværende mappe.")
        return

    if not os.path.exists(media_dir):
        os.makedirs(media_dir)

    try:
        # The archive's modification time is shown as the generation date.
        archive_date = get_danish_date(os.path.getmtime(zip_filename))

        with zipfile.ZipFile(zip_filename, 'r') as archive:
            # Look members up by bare file name, ignoring directory entries.
            zip_file_map = {
                os.path.basename(f): f
                for f in archive.namelist()
                if not f.endswith('/')
            }

            # 1. Load the profile information
            actor = _load_json(archive, 'actor.json')
            # 2. Load the posts
            outbox = _load_json(archive, 'outbox.json')
            # 3. Extract the profile picture
            avatar_src = _extract_avatar(archive, actor)

            # Derive the user data
            username = actor.get('preferredUsername', 'ukendt')
            profile_url = actor.get('url') or ''
            full_handle = f"@{username}@{urlparse(profile_url).netloc}"

            # Profile creation date; shown raw when it cannot be parsed.
            profile_published = actor.get('published', '')
            profile_created_str = ""
            if profile_published:
                created_dt = _parse_iso_timestamp(profile_published)
                shown = (
                    format_danish_date(created_dt)
                    if created_dt is not None
                    else profile_published
                )
                profile_created_str = f"Oprettet: {shown}"

            posts, stats = _collect_posts(
                archive, outbox, zip_file_map, media_dir, profile_url
            )

        profile = {
            'display_name': actor.get('name', 'Ukendt'),
            'full_handle': full_handle,
            'bio': actor.get('summary') or '',
            'created': profile_created_str,
            'avatar_src': avatar_src,
        }
        html_content = _render_page(profile, posts, stats, archive_date)

        with open(html_filename, 'w', encoding='utf-8') as f:
            f.write(html_content)

        print(f"Succes! {html_filename} er nu blevet genereret.")

    except Exception as e:
        # Top-level boundary of the script: report the problem and stop.
        print(f"Der opstod en fejl under behandlingen af arkivet: {e}")


if __name__ == "__main__":
    generate_html_archive()