import os
import sys
import csv
import time
import socket
import math
import whois
import tldextract
import requests
from datetime import datetime
from collections import Counter
from concurrent.futures import ThreadPoolExecutor
from bs4 import BeautifulSoup



# Put the directory containing this file first on the module search path, so
# imports are resolved against it before any other sys.path entry.
# NOTE(review): presumably needed because the WSGI host does not start the
# interpreter in this directory — confirm against the deployment setup.
sys.path.insert(0, os.path.dirname(__file__))


def application(environ, start_response):
    """Minimal WSGI callable that reports the running Python version.

    Args:
        environ: WSGI environment mapping (not read by this handler).
        start_response: WSGI callback; called once with status ``'200 OK'``
            and a single ``Content-Type: text/plain`` header.

    Returns:
        A one-element list holding the encoded body: the greeting line, an
        empty line, then ``Python <version>``.
    """
    status = '200 OK'
    headers = [('Content-Type', 'text/plain')]
    start_response(status, headers)

    # First whitespace-separated token of sys.version is the version number.
    interpreter = sys.version.split()[0]
    lines = ('It works!\n', 'Python %s\n' % interpreter)
    return ['\n'.join(lines).encode()]
    
    
    

# Calculate entropy of a string
def calculate_entropy(string):
    """Return the Shannon entropy (in bits per character) of *string*.

    Each distinct character's probability is its relative frequency in the
    string; the result is ``-sum(p * log2(p))`` over those probabilities.

    Args:
        string: The text to measure. May be empty.

    Returns:
        The entropy as a non-negative float. An empty string, or a string made
        of a single repeated character, yields ``0.0``.
    """
    if not string:
        return 0.0

    total = len(string)
    entropy = 0.0
    # Counter tallies every character in one pass instead of calling
    # str.count once per distinct character.
    for occurrences in Counter(string).values():
        p = occurrences / total
        # Subtracting each term (rather than negating the final sum) keeps the
        # single-symbol case at +0.0 instead of -0.0.
        entropy -= p * math.log2(p)
    return entropy

# Extract HTML elements from a URL with retries
def extract_html_elements(url, retries=3):
    """Count the HTML elements on the page served at ``http://<url>``.

    The page is fetched with a 10-second timeout and parsed with
    BeautifulSoup's ``html.parser``; the count is the number of tags returned
    by ``soup.find_all()``. A request that raises a ``requests`` exception is
    retried with exponential backoff (1s, 2s, 4s, ...) between attempts.

    Args:
        url: Host name without a scheme; ``http://`` is prepended.
        retries: Maximum number of attempts.

    Returns:
        The number of elements found, or ``0`` when every attempt failed (or
        when ``retries`` is not positive and no attempt is made).
    """
    for attempt in range(retries):
        try:
            response = requests.get(f'http://{url}', timeout=10)
            soup = BeautifulSoup(response.text, 'html.parser')
            return len(soup.find_all())
        except requests.exceptions.RequestException as e:
            print(f"[Attempt {attempt + 1}] Error fetching {url}: {e}")
            # Back off only when another attempt will follow; sleeping after
            # the final failure just delays returning 0.
            if attempt < retries - 1:
                time.sleep(2 ** attempt)
    return 0

# Safe WHOIS lookup with retries
def safe_whois(domain, retries=3):
    """Look up WHOIS data for *domain*, retrying on any failure.

    Any exception raised by ``whois.whois`` is printed and the lookup is
    retried with exponential backoff (1s, 2s, 4s, ...) between attempts.

    Args:
        domain: The domain name to query.
        retries: Maximum number of attempts.

    Returns:
        Whatever ``whois.whois(domain)`` returns on the first successful
        attempt, or ``None`` when every attempt raised (or when ``retries``
        is not positive and no attempt is made).
    """
    for attempt in range(retries):
        try:
            return whois.whois(domain)
        except Exception as e:
            print(f"[Attempt {attempt + 1}] WHOIS error for {domain}: {e}")
            # Back off only when another attempt will follow; sleeping after
            # the final failure just delays returning None.
            if attempt < retries - 1:
                time.sleep(2 ** attempt)
    return None

# Process a single URL
def process_url(url):
    """Build the feature row for a single input URL.

    The URL is split with ``tldextract``; the WHOIS registrar, the length,
    vowel count and entropy of the domain label, and the number of HTML
    elements on the site's landing page are collected into one dict.

    Args:
        url: The URL or host name to analyse.

    Returns:
        A dict with the keys ``'Domain Name'``, ``'Registrar'``, ``'Length'``,
        ``'Num Vowels'``, ``'Entropy'``, ``'Num HTML Elements'`` and
        ``'Error'``. ``'Error'`` is empty on success, ``'WHOIS Lookup Failed'``
        when the WHOIS result is missing or empty, and the exception text when
        anything else raised. In that last case ``'Domain Name'`` is the
        original *url* and the numeric fields are zero.
    """
    try:
        extracted = tldextract.extract(url)
        domain = extracted.domain
        # Join only the non-empty parts: for an IP address or a host with no
        # recognised public suffix the suffix is empty, and formatting
        # "{domain}.{suffix}" would leave a dangling trailing dot.
        full_domain = ".".join(
            part for part in (domain, extracted.suffix) if part
        )

        w = safe_whois(full_domain)
        registrar = w.registrar if w else 'N/A'

        vowels = "aeiouAEIOU"
        n_vowels = sum(1 for char in domain if char in vowels)
        entropy = calculate_entropy(domain)
        html_elements = extract_html_elements(full_domain)

        return {
            'Domain Name': full_domain,
            'Registrar': registrar,
            'Length': len(domain),
            'Num Vowels': n_vowels,
            'Entropy': entropy,
            'Num HTML Elements': html_elements,
            'Error': '' if w else 'WHOIS Lookup Failed'
        }
    except Exception as e:
        # Deliberate catch-all: one bad URL must yield an error row rather
        # than abort the whole batch.
        return {
            'Domain Name': url,
            'Registrar': 'N/A',
            'Length': 0,
            'Num Vowels': 0,
            'Entropy': 0,
            'Num HTML Elements': 0,
            'Error': str(e)
        }

# Batch processing URLs with limited concurrency
def process_batch(batch, start_count):
    """Run ``process_url`` over *batch* on a three-worker thread pool.

    Args:
        batch: Iterable of URLs to process.
        start_count: Number assigned to the first URL of the batch; later
            URLs are numbered consecutively in input order.

    Returns:
        A list of the result dicts in input order, each with a ``'Count'``
        key added. Falsy results are left out but still consume a number.
    """
    collected = []
    with ThreadPoolExecutor(max_workers=3) as pool:
        pending = [pool.submit(process_url, item) for item in batch]
        # Walk the futures in submission order so numbering follows the input.
        for count, task in enumerate(pending, start=start_count):
            outcome = task.result()
            if not outcome:
                continue
            outcome['Count'] = count
            collected.append(outcome)
    return collected

# Process a single file in batches
def process_file(input_file, output_file, batch_size=50000):
    """Process every domain listed in one CSV file and write a result CSV.

    The input must have a ``Domain`` column. Non-blank values from it are
    processed in slices of *batch_size* via ``process_batch``, and each
    slice's rows are written to *output_file* as soon as it completes.

    Args:
        input_file: Path of the CSV file to read.
        output_file: Path of the CSV file to create (overwritten if present).
        batch_size: Number of domains handed to ``process_batch`` at a time.

    Returns:
        None. If the input yields no domains, a message is printed and
        *output_file* is not created.
    """
    # newline='' lets the csv module do its own newline handling.
    with open(input_file, 'r', newline='') as infile:
        reader = csv.DictReader(infile)
        urls = []
        for row in reader:
            # DictReader fills columns missing from a short row with None, so
            # guard before stripping; surrounding whitespace is dropped.
            value = (row.get('Domain') or '').strip()
            if value:
                urls.append(value)
    if not urls:
        print(f"No URLs found in {input_file}.")
        return

    with open(output_file, 'w', newline='') as csvfile:
        fieldnames = ['Count', 'Domain Name', 'Registrar', 'Length', 'Num Vowels', 'Entropy', 'Num HTML Elements', 'Error']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()

        # Ceiling division: an exact multiple of batch_size must not be
        # reported as having one extra batch.
        total_batches = (len(urls) + batch_size - 1) // batch_size
        global_count = 1
        for batch_number, i in enumerate(range(0, len(urls), batch_size), start=1):
            batch = urls[i:i + batch_size]
            print(f"Processing batch {batch_number} of {total_batches} for {input_file}")
            results = process_batch(batch, global_count)
            if results:
                writer.writerows(results)
            # Advance by the batch size, not the number of rows written, so
            # numbering stays aligned with the input position.
            global_count += len(batch)
            # Pause between batches to throttle outbound lookups.
            time.sleep(2)

# Process multiple files
def process_all_chunks(input_dir, output_dir, batch_size=50000):
    """Run ``process_file`` on every ``.csv`` file found in *input_dir*.

    Args:
        input_dir: Directory whose ``.csv`` entries are processed.
        output_dir: Directory for the result files; created if missing.
        batch_size: Passed through to ``process_file``.

    Returns:
        None. Each result file takes its input's name with every
        ``chunk_`` replaced by ``result_``.
    """
    os.makedirs(output_dir, exist_ok=True)

    for entry in os.listdir(input_dir):
        if not entry.endswith('.csv'):
            continue

        source = os.path.join(input_dir, entry)
        target_name = entry.replace("chunk_", "result_")
        target = os.path.join(output_dir, target_name)

        print(f"Processing: {source}")
        process_file(source, target, batch_size)
        print(f"Completed: {target}")

# Script entry point: when this file is run directly (not imported), process
# every CSV in the 'chunks' directory and write the results to 'output', using
# process_all_chunks' default batch size. Both paths are relative, so they
# resolve against the current working directory.
if __name__ == "__main__":
    input_dir = 'chunks'
    output_dir = 'output'
    process_all_chunks(input_dir, output_dir)
