"""Generate a static HTML page from a Mastodon account export.

The script reads ``actor.json`` and ``outbox.json`` from the export zip,
unpacks the avatar and the attached media next to the generated page, and
writes one HTML file with client-side filters for year, hashtag and content.
"""

import html
import json
import os
import re
import shutil
import zipfile
from collections import Counter
from datetime import datetime
from urllib.parse import urlparse

MONTHS_DA = (
    'januar', 'februar', 'marts', 'april', 'maj', 'juni',
    'juli', 'august', 'september', 'oktober', 'november', 'december',
)
# Abbreviations are spelled out here because strftime('%b') follows the
# process locale and would not reliably produce Danish names.
MONTHS_DA_SHORT = (
    'jan', 'feb', 'mar', 'apr', 'maj', 'jun',
    'jul', 'aug', 'sep', 'okt', 'nov', 'dec',
)

_AVATAR_EXTENSIONS = frozenset({'png', 'jpg', 'jpeg', 'gif'})
_TAG_RE = re.compile(r'<[^>]+>')
_ANCHOR_RE = re.compile(r'<a\b[^>]*>', re.IGNORECASE)
_CLASS_RE = re.compile(r'class="([^"]*)"')

_PAGE_CSS = """
body { font-family: system-ui, sans-serif; margin: 0; background: #f2f3f7; color: #1f2330; }
.container { max-width: 720px; margin: 0 auto; padding: 1rem; }
.card, .post, .notice { background: #fff; border-radius: 8px; padding: 1rem; margin-bottom: 1rem; }
.avatar { width: 96px; height: 96px; border-radius: 50%; }
.avatar-mini { width: 32px; height: 32px; border-radius: 50%; vertical-align: middle; }
.handle, .created, .post-footer { color: #606984; font-size: 0.9rem; }
.stats span { margin-right: 1rem; }
.filter-group { margin-bottom: 0.75rem; }
.filter-btn { margin: 0 0.25rem 0.25rem 0; padding: 0.25rem 0.6rem; border: 1px solid #c0c4d6;
              border-radius: 999px; background: #fff; cursor: pointer; }
.filter-btn.active { background: #6364ff; border-color: #6364ff; color: #fff; }
.post-type { margin-left: 0.5rem; font-size: 0.8rem; color: #606984; }
.media img, .media video { max-width: 100%; border-radius: 6px; margin-top: 0.5rem; }
"""

_PAGE_JS = """
(function () {
  var state = { year: 'all', tag: 'all', content: 'all' };
  var posts = Array.prototype.slice.call(document.querySelectorAll('.post'));
  var buttons = Array.prototype.slice.call(document.querySelectorAll('button[data-filter]'));

  function visible(post) {
    var d = post.dataset;
    if (state.year !== 'all' && d.year !== state.year) { return false; }
    if (state.tag !== 'all' && d.tags.split(',').indexOf(state.tag) === -1) { return false; }
    if (state.content === 'media') { return d.hasMedia === 'true'; }
    if (state.content === 'links') { return d.hasLinks === 'true'; }
    if (state.content === 'tags') { return d.hasTags === 'true'; }
    if (state.content === 'no_tags') { return d.hasTags === 'false'; }
    return true;
  }

  buttons.forEach(function (button) {
    button.addEventListener('click', function () {
      var group = button.dataset.filter;
      state[group] = button.dataset.value;
      buttons.forEach(function (other) {
        if (other.dataset.filter === group) {
          other.classList.toggle('active', other === button);
        }
      });
      posts.forEach(function (post) { post.hidden = !visible(post); });
    });
  });
}());
"""


def get_danish_date(timestamp):
    """Return the local date of a POSIX *timestamp* as e.g. ``5. marts 2024``."""
    return format_danish_date(datetime.fromtimestamp(timestamp))


def format_danish_date(dt):
    """Format a ``date``/``datetime`` as ``<day>. <Danish month name> <year>``."""
    return f"{dt.day}. {MONTHS_DA[dt.month - 1]} {dt.year}"


def get_safe_id(mastodon_id_url):
    """Return the last path segment of a status URL, or ``"unknown"``.

    Trailing slashes are ignored. Empty input, or input that leaves no
    segment (such as ``"/"``), yields ``"unknown"`` so the result can always
    be used as part of an element id.
    """
    if not mastodon_id_url:
        return "unknown"
    return str(mastodon_id_url).rstrip('/').split('/')[-1] or "unknown"


def _as_list(value):
    """Normalise a JSON value that may be a list, a single object or null."""
    if value is None:
        return []
    if isinstance(value, (list, tuple)):
        return list(value)
    return [value]


def _parse_iso_datetime(value):
    """Parse an ISO-8601 string (``Z`` suffix allowed); return None on failure."""
    if not isinstance(value, str) or not value:
        return None
    try:
        return datetime.fromisoformat(value.replace('Z', '+00:00'))
    except ValueError:
        return None


def _format_post_date(dt):
    """Format a post timestamp as ``05. mar 2024 kl. 14:30``."""
    return f"{dt:%d}. {MONTHS_DA_SHORT[dt.month - 1]} {dt:%Y} kl. {dt:%H:%M}"


def _relative_url(path, start):
    """Return *path* relative to directory *start*, using forward slashes."""
    try:
        relative = os.path.relpath(path, start)
    except ValueError:
        # relpath raises ValueError on Windows when the drives differ.
        relative = path
    return relative.replace(os.sep, '/')


def _extract_member(archive, member, destination):
    """Stream one zip member to *destination* without loading it into memory."""
    with archive.open(member) as src, open(destination, 'wb') as dst:
        shutil.copyfileobj(src, dst)


def _extract_avatar(archive, actor, html_dir):
    """Unpack the profile picture into *html_dir*; return its file name or ''."""
    icon = actor.get('icon')
    if not isinstance(icon, dict):
        return ""
    member = icon.get('url')
    if not isinstance(member, str) or not member:
        return ""
    ext = member.rsplit('.', 1)[-1].lower()
    if ext not in _AVATAR_EXTENSIONS:
        ext = 'png'
    avatar_name = f"avatar.{ext}"
    try:
        _extract_member(archive, member, os.path.join(html_dir, avatar_name))
    except KeyError:
        # The archive has no such member: render the page without an avatar.
        return ""
    return avatar_name


def _has_external_link(content):
    """Return True if *content* has an anchor that is not a mention or hashtag."""
    for anchor in _ANCHOR_RE.findall(content):
        class_match = _CLASS_RE.search(anchor)
        if class_match is None:
            return True
        classes = class_match.group(1).split()
        if 'mention' not in classes and 'hashtag' not in classes:
            return True
    return False


def _extract_media(obj, archive, zip_file_map, media_dir, html_dir):
    """Unpack a post's attachments and return the HTML that embeds them.

    Attachments are matched to zip members by file name. Only images and
    videos produce markup; a failed extraction is reported and skipped.
    """
    fragments = []
    for att in _as_list(obj.get('attachment')):
        if not isinstance(att, dict):
            continue
        att_url = att.get('url')
        if not isinstance(att_url, str) or not att_url:
            continue
        filename = os.path.basename(urlparse(att_url).path)
        member = zip_file_map.get(filename)
        if not filename or member is None:
            continue
        target = os.path.join(media_dir, filename)
        try:
            _extract_member(archive, member, target)
        except (OSError, KeyError, RuntimeError, zipfile.BadZipFile) as e:
            print(f"Kunne ikke udpakke medie {filename}: {e}")
            continue
        src = html.escape(_relative_url(target, html_dir), quote=True)
        media_type = att.get('mediaType') or ''
        if media_type.startswith('image/'):
            fragments.append(
                f'<img src="{src}" alt="Vedhæftet billede" loading="lazy">'
            )
        elif media_type.startswith('video/'):
            fragments.append(
                f'<video src="{src}" controls preload="metadata"></video>'
            )
    if not fragments:
        return ""
    return '<div class="media">\n' + "\n".join(fragments) + '\n</div>'


def _build_post(obj, archive, zip_file_map, media_dir, html_dir, profile_url):
    """Turn one ActivityPub object from the outbox into a render-ready dict."""
    raw_content = obj.get('content') or ''
    if not isinstance(raw_content, str):
        raw_content = str(raw_content)

    # A quote is recognised by its visible text starting with "RE:".
    plain_text = _TAG_RE.sub('', raw_content).strip()
    if plain_text.startswith('RE:'):
        type_label, type_class = "Citat", "citat"
    elif obj.get('inReplyTo') is not None:
        type_label, type_class = "Svar", "svar"
    else:
        type_label, type_class = "Post", "post"

    published = obj.get('published') or ''
    parsed = _parse_iso_datetime(published)
    if parsed is None:
        year, formatted_date = "Ukendt", str(published)
    else:
        year, formatted_date = str(parsed.year), _format_post_date(parsed)

    tags = []
    for tag in _as_list(obj.get('tag')):
        if isinstance(tag, dict) and tag.get('type') == 'Hashtag':
            name = str(tag.get('name') or '').lower()
            if name:
                tags.append(name)

    media_html = _extract_media(obj, archive, zip_file_map, media_dir, html_dir)

    return {
        'safe_id': get_safe_id(obj.get('id', '')),
        'content': raw_content,
        'media_html': media_html,
        'date': formatted_date,
        'url': str(obj.get('url') or profile_url),
        'raw_date': str(published),
        'year': year,
        'tags': tags,
        'type_label': type_label,
        'type_class': type_class,
        'has_media': bool(media_html),
        'has_links': _has_external_link(raw_content),
        'has_tags': bool(tags),
    }


def _bool_attr(flag):
    """Render a boolean as the string used in ``data-*`` attributes."""
    return "true" if flag else "false"


def _filter_button(group, value, label, active=False):
    """Render one filter button for filter *group* selecting *value*."""
    css_class = "filter-btn active" if active else "filter-btn"
    safe_value = html.escape(value, quote=True)
    safe_label = html.escape(label)
    return (
        f'<button type="button" class="{css_class}" data-filter="{group}" '
        f'data-value="{safe_value}">{safe_label}</button>\n'
    )


def _render_post(post, avatar_src, display_name):
    """Render a single post as an ``<article>`` carrying its filter attributes."""
    avatar_html = ""
    if avatar_src:
        safe_avatar = html.escape(avatar_src, quote=True)
        avatar_html = (
            f'<img class="avatar-mini" src="{safe_avatar}" alt="Mini profilbillede">'
        )
    safe_id = html.escape(post['safe_id'], quote=True)
    safe_tags = html.escape(",".join(post['tags']), quote=True)
    safe_url = html.escape(post['url'], quote=True)
    safe_date = html.escape(post['date'])
    safe_name = html.escape(display_name)
    # post['content'] is inserted unescaped because the export stores the post
    # body as HTML. NOTE(review): this trusts the export file's markup.
    return f"""
<article class="post {post['type_class']}" id="post-{safe_id}"
         data-year="{post['year']}" data-tags="{safe_tags}"
         data-type="{post['type_class']}"
         data-has-media="{_bool_attr(post['has_media'])}"
         data-has-links="{_bool_attr(post['has_links'])}"
         data-has-tags="{_bool_attr(post['has_tags'])}">
  <header class="post-header">
    {avatar_html}
    <span class="post-author">{safe_name}</span>
    <span class="post-type">{post['type_label']}</span>
  </header>
  <div class="post-content">
    {post['content']}
  </div>
  {post['media_html']}
  <footer class="post-footer"><a href="{safe_url}">{safe_date}</a></footer>
</article>
"""


def generate_html_archive(zip_filename="archive.zip", html_filename="index.html",
                          media_dir="media"):
    """Build the HTML archive page from a Mastodon export zip.

    Args:
        zip_filename: Path of the export zip containing ``actor.json`` and
            ``outbox.json``.
        html_filename: Path of the page to write. The avatar is unpacked into
            the same directory.
        media_dir: Directory that receives the unpacked attachments; created
            when missing.

    Returns:
        None. Progress and errors are reported on stdout; a missing zip or a
        failure while processing it does not raise.
    """
    if not os.path.exists(zip_filename):
        print(f"Fejl: Kunne ikke finde '{zip_filename}' i den nuværende mappe.")
        return

    os.makedirs(media_dir, exist_ok=True)
    html_dir = os.path.dirname(html_filename) or "."

    try:
        archive_date = get_danish_date(os.path.getmtime(zip_filename))

        with zipfile.ZipFile(zip_filename, 'r') as archive:
            # Attachments are looked up by bare file name, whatever their folder.
            zip_file_map = {
                os.path.basename(name): name
                for name in archive.namelist()
                if not name.endswith('/')
            }

            with archive.open('actor.json') as f:
                actor = json.load(f)
            with archive.open('outbox.json') as f:
                outbox = json.load(f)

            avatar_src = _extract_avatar(archive, actor, html_dir)
            profile_url = str(actor.get('url') or '')

            # Only 'Create' activities with an embedded object are posts.
            posts = []
            for item in _as_list(outbox.get('orderedItems')):
                if not isinstance(item, dict) or item.get('type') != 'Create':
                    continue
                obj = item.get('object')
                if isinstance(obj, dict):
                    posts.append(_build_post(
                        obj, archive, zip_file_map, media_dir, html_dir,
                        profile_url,
                    ))

        display_name = str(actor.get('name') or 'Ukendt')
        username = str(actor.get('preferredUsername') or 'ukendt')
        full_handle = f"@{username}@{urlparse(profile_url).netloc}"
        bio = actor.get('summary') or ''

        profile_published = actor.get('published') or ''
        profile_created_str = ""
        if profile_published:
            created = _parse_iso_datetime(profile_published)
            if created is not None:
                profile_created_str = f"Oprettet: {format_danish_date(created)}"
            else:
                profile_created_str = f"Oprettet: {profile_published}"

        posts.sort(key=lambda post: post['raw_date'], reverse=True)
        total_posts = len(posts)

        type_counts = Counter(post['type_class'] for post in posts)
        year_counts = Counter(
            post['year'] for post in posts if post['year'] != "Ukendt"
        )
        tag_counts = Counter(tag for post in posts for tag in post['tags'])
        content_counts = {
            'media': sum(post['has_media'] for post in posts),
            'links': sum(post['has_links'] for post in posts),
            'tags': sum(post['has_tags'] for post in posts),
            'no_tags': sum(not post['has_tags'] for post in posts),
        }
        # Most used tags first, ties in alphabetical order.
        sorted_tags = sorted(tag_counts.items(), key=lambda item: (-item[1], item[0]))

        years_html = _filter_button('year', 'all', f"Alle ({total_posts})", active=True)
        for year in sorted(year_counts, reverse=True):
            years_html += _filter_button('year', year, f"{year} ({year_counts[year]})")

        content_html = _filter_button('content', 'all', f"Alle ({total_posts})", active=True)
        content_html += _filter_button('content', 'media', f"Medier ({content_counts['media']})")
        content_html += _filter_button('content', 'links', f"Links ({content_counts['links']})")
        content_html += _filter_button('content', 'tags', f"Med tags ({content_counts['tags']})")
        content_html += _filter_button('content', 'no_tags', f"Uden tags ({content_counts['no_tags']})")

        tags_html = ""
        if sorted_tags:
            tags_html += '<div class="filter-group">\n'
            tags_html += '<span class="filter-label">Tags:</span>\n'
            tags_html += '<div class="filter-buttons">\n'
            tags_html += _filter_button('tag', 'all', f"Alle ({total_posts})", active=True)
            for tag, count in sorted_tags:
                display_tag = tag if tag.startswith('#') else f"#{tag}"
                tags_html += _filter_button('tag', tag, f"{display_tag} ({count})")
            tags_html += '</div>\n</div>\n'

        safe_name = html.escape(display_name)
        safe_handle = html.escape(full_handle)
        avatar_html = ""
        if avatar_src:
            safe_avatar = html.escape(avatar_src, quote=True)
            avatar_html = f'<img class="avatar" src="{safe_avatar}" alt="Profilbillede">'
        created_html = ""
        if profile_created_str:
            created_html = f'<div class="created">{html.escape(profile_created_str)}</div>'

        # bio is inserted unescaped because the export stores it as HTML.
        page_parts = [f"""<!DOCTYPE html>
<html lang="da">
<head>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1">
<title>Mastodon Arkiv - {safe_name}</title>
<style>{_PAGE_CSS}</style>
</head>
<body>
<div class="container">
<header>
<h1>Mastodon Arkiv</h1>
</header>
<section class="card profile">
{avatar_html}
<h2>{safe_name}</h2>
<div class="handle">{safe_handle}</div>
{created_html}
<div class="bio">{bio}</div>
<div class="stats">
<span>{type_counts['post']} Posts</span>
<span>{type_counts['svar']} Svar</span>
<span>{type_counts['citat']} Citater</span>
</div>
</section>
<section class="notice">
<h3>Kontoen er flyttet!</h3>
<p>Min nye Mastodon instans og profil kan findes her: @aphandersen@andersen.one</p>
<p>Dette historiske Mastodon-arkiv blev genereret den {archive_date}.</p>
</section>
<section class="filters">
<div class="filter-group">
{years_html}</div>
<div class="filter-group">
{content_html}</div>
{tags_html}</section>
<main id="posts">
"""]
        page_parts.extend(
            _render_post(post, avatar_src, display_name) for post in posts
        )
        page_parts.append(f"""
</main>
</div>
<script>{_PAGE_JS}</script>
</body>
</html>
""")

        with open(html_filename, 'w', encoding='utf-8') as f:
            f.write("".join(page_parts))

        print(f"Succes! {html_filename} er nu blevet genereret.")

    except Exception as e:
        # Top-level boundary: report the problem instead of showing a traceback.
        print(f"Der opstod en fejl under behandlingen af arkivet: {e}")


if __name__ == "__main__":
    generate_html_archive()