#!/usr/bin/env python
# Pastebin cx665DUo
# SPDX-License-Identifier: CC0-1.0

import re
import os
import sys
import requests
import pandas as pd
from io import StringIO
import html2text as ht
from bs4 import BeautifulSoup, Comment

# Shared HTML -> Markdown converter used to render the PR page as PR.md.
html2text = ht.HTML2Text()
html2text.ignore_links = False  # keep hyperlinks in the Markdown output
html2text.images_to_alt = False  # keep <img> references, not just alt text

def scrap_pr(author, repo, pr_num):
    """Scrape a GitHub pull request into a local directory named after it.

    Writes into ``str(pr_num)/``:
      * every non-avatar image referenced by the PR page (``img_N.png``
        or ``img_N.jpg``), with the page's <img> tags rewritten to the
        local filenames,
      * ``PR.md`` — the PR page converted to Markdown,
      * one ``N_<sha>.patch`` file per unique commit linked from the
        PR's commits tab.

    Parameters:
        author: GitHub user or organization owning the repository.
        repo: repository name.
        pr_num: pull-request number; also used as the output directory.

    Raises:
        requests.RequestException: on network failure.
        OSError: on file-system failure other than the directory
            already existing.
    """
    # Create the output directory; tolerate only "already exists"
    # (the original bare `except: pass` hid every other error too).
    os.makedirs(str(pr_num), exist_ok=True)

    req = requests.get(f"https://github.com/{author}/{repo}/pull/{pr_num}")
    html_content = req.content.decode('utf-8')

    soup = BeautifulSoup(html_content, 'html.parser')
    img_num = 0
    for img in soup.find_all('img'):
        src = img.get('src')
        # Skip tags without a src attribute and user avatars.
        if not src or 'avatars.githubusercontent.com' in src:
            continue
        req_img = requests.get(src)
        req_img_data = req_img.content
        # PNG files start with the magic bytes b'\x89\x50' ("\x89P").
        # The original compared bytes-indexing results (ints) against
        # str literals — always False — so every image became .jpg.
        if req_img_data[:2] == b'\x89\x50':
            fn = f"img_{img_num}.png"
        else:
            fn = f"img_{img_num}.jpg"
        with open(f"{pr_num}/{fn}", "wb") as f:
            f.write(req_img_data)
        # Point the tag at the saved copy so the Markdown links locally.
        img['src'] = fn
        img_num += 1

    cont = html2text.handle(str(soup))

    with open(f"{pr_num}/PR.md", 'w', encoding='utf-8') as f:
        f.write(cont)

    # Fetch one .patch file per unique commit linked from the commits tab.
    commits_req = requests.get(f"https://github.com/{author}/{repo}/pull/{pr_num}/commits")
    soup = BeautifulSoup(commits_req.content.decode('utf-8'), 'html.parser')
    patch_num = 0
    patches = {}
    for a in soup.find_all('a'):
        href = a.get('href')
        # Skip anchors without href and commits already downloaded.
        if not href or href in patches:
            continue
        if href.startswith(f"/{author}/{repo}/pull/{pr_num}/commits/"):
            patch_req = requests.get('https://github.com' + href + '.patch')
            with open(f"{pr_num}/{patch_num}_{os.path.basename(href)}.patch", "wb") as f:
                f.write(patch_req.content)
            patch_num += 1
            patches[href] = 1

if __name__ == "__main__":
    # Usage: script.py <pr_number> — scrapes the given PCBox/PCBox PR.
    # Guard prevents network scraping as a side effect of importing
    # this module.
    scrap_pr('PCBox', 'PCBox', sys.argv[1])