import csv
import math
import os
import socket
import time
from collections import Counter
from concurrent.futures import ThreadPoolExecutor
from datetime import datetime

import geoip2
import geoip2.database
import requests
import whois
from requests.exceptions import RequestException



# sys.path.insert(0, os.path.dirname(__file__))


# def application(environ, start_response):
#     start_response('200 OK', [('Content-Type', 'text/plain')])
#     message = 'It works!\n'
#     version = 'Python %s\n' % sys.version.split()[0]
#     response = '\n'.join([message, version])
#     return [response.encode()]
    
    

# Calculate lexical features for a domain
def calculate_lexical_features(domain):
    """Compute character-level statistics for a domain string.

    Args:
        domain: The domain name (or raw input string) to analyse.

    Returns:
        A dict with the keys 'Length', 'Num Vowels', 'Num Consonants',
        'Num Numbers', 'Num Other Chars', 'Entropy', 'Num Labels' and
        'Num HTML Elements'.
    """
    vowels = "aeiou"
    numbers = "0123456789"
    length = len(domain)
    labels = domain.split('.')

    n_vowels = sum(1 for char in domain if char.lower() in vowels)
    n_consonants = sum(1 for char in domain if char.isalpha() and char.lower() not in vowels)
    n_nums = sum(1 for char in domain if char in numbers)
    # Dots are label separators, so they are not counted as "other" characters.
    n_other_chars = sum(1 for char in domain if not char.isalnum() and char != '.')

    # Shannon entropy (bits per character) of the character distribution.
    weighted_logs = sum(
        (count / length) * math.log2(count / length)
        for count in Counter(domain).values()
    )
    # Negating a zero sum would yield -0.0 (e.g. for "aaa"); report a clean 0.0.
    entropy = -weighted_logs if weighted_logs else 0.0

    # Placeholder only: no HTML is fetched, the value is 10 per label.
    html_elements = len(labels) * 10

    return {
        'Length': length,
        'Num Vowels': n_vowels,
        'Num Consonants': n_consonants,
        'Num Numbers': n_nums,
        'Num Other Chars': n_other_chars,
        'Entropy': entropy,
        'Num Labels': len(labels),
        'Num HTML Elements': html_elements
    }

# Format the dates properly
def format_date(date):
    """Normalise a WHOIS date value for output.

    Args:
        date: A ``datetime``, a list of values (the first entry is used),
            or any other value.

    Returns:
        A 'YYYY-MM-DD' string when the (first) value is a ``datetime``;
        ``None`` for an empty list; otherwise the value unchanged.
    """
    if isinstance(date, list):
        if not date:
            # An empty list carries no date; indexing it would raise IndexError.
            return None
        date = date[0]  # Use the first date if it's a list
    if isinstance(date, datetime):
        return date.strftime('%Y-%m-%d')
    return date

# Get geolocation information using GeoLite2 database
def get_geolocation(ip_address, reader):
    """Return country, city and coordinates for an IP address.

    Args:
        ip_address: The address passed to ``reader.city``.
        reader: An object exposing a ``city(ip_address)`` lookup.

    Returns:
        A dict with the keys 'Country', 'City', 'Latitude' and 'Longitude'.
        Every value is 'N/A' when the lookup raises a GeoIP2 error.
    """
    columns = ('Country', 'City', 'Latitude', 'Longitude')
    try:
        record = reader.city(ip_address)
        values = (
            record.country.name,
            record.city.name,
            record.location.latitude,
            record.location.longitude,
        )
        return dict(zip(columns, values))
    except (geoip2.errors.AddressNotFoundError, geoip2.errors.GeoIP2Error):
        # Lookup failed: fill every column with the placeholder.
        return dict.fromkeys(columns, 'N/A')

# Process a single URL with retry logic
def process_url(url, retries=3, backoff_factor=0.3, reader=None):
    """Look up WHOIS data for one domain and return a flat result row.

    The WHOIS lookup is retried with exponential backoff. When every attempt
    fails, a row of 'N/A' placeholders is returned with the last error
    message in 'Error'.

    Args:
        url: The domain to look up.
        retries: Maximum number of attempts. If less than 1, no lookup is
            attempted and ``None`` is returned.
        backoff_factor: Base delay in seconds; attempt ``n`` (0-based) waits
            ``backoff_factor * 2 ** n`` before the next try.
        reader: Optional GeoIP reader passed to ``get_geolocation``. When
            omitted, the geolocation columns are 'N/A'.

    Returns:
        A dict holding the WHOIS, lexical and geolocation columns, or
        ``None`` when ``retries`` is less than 1.
    """
    for attempt in range(retries):
        try:
            w = whois.whois(url)

            # The registry may report one name or a list of names; normalise
            # to a single lowercase string in both cases.
            domain_name = w.domain_name
            if isinstance(domain_name, list):
                domain_name = domain_name[0] if domain_name else None
            if isinstance(domain_name, str):
                domain_name = domain_name.lower()
            target = domain_name if domain_name else url

            creation_date = format_date(w.creation_date)
            expiration_date = format_date(w.expiration_date)

            # A bare string must be wrapped first, otherwise join() would
            # insert the separator between its individual characters.
            name_servers = w.name_servers
            if isinstance(name_servers, str):
                name_servers = [name_servers]
            name_servers = ', '.join(name_servers) if name_servers else ''

            lexical_features = calculate_lexical_features(target)
            geolocation = _geolocate_host(target, reader)

            return {
                'Domain Name': target,
                'Registrar': w.registrar if w.registrar else 'N/A',
                'Creation Date': creation_date if creation_date else 'N/A',
                'Expiration Date': expiration_date if expiration_date else 'N/A',
                'Name Servers': name_servers if name_servers else 'N/A',
                'Error': '',  # No error
                **lexical_features,
                **geolocation
            }
        except Exception as e:
            if attempt < retries - 1:
                time.sleep(backoff_factor * (2 ** attempt))  # Exponential backoff
            else:
                return _failure_row(url, e)


def _unknown_geolocation():
    """Return the geolocation columns filled with the 'N/A' placeholder."""
    return {
        'Country': 'N/A',
        'City': 'N/A',
        'Latitude': 'N/A',
        'Longitude': 'N/A'
    }


def _geolocate_host(host, reader):
    """Resolve *host* via DNS and geolocate its address; 'N/A' on any failure."""
    if not reader:
        return _unknown_geolocation()
    try:
        # Geolocate the domain's own address, not the scanning machine's.
        ip_address = socket.gethostbyname(host)
        return get_geolocation(ip_address, reader)
    except Exception:
        # Geolocation is best effort: a failure here must not discard the
        # WHOIS data that was already retrieved.
        return _unknown_geolocation()


def _failure_row(url, error):
    """Build the placeholder row reported when every WHOIS attempt failed."""
    return {
        'Domain Name': url,  # Retain the input domain name
        'Registrar': 'N/A',
        'Creation Date': 'N/A',
        'Expiration Date': 'N/A',
        'Name Servers': 'N/A',
        'Error': str(error),
        'Length': 'N/A',
        'Num Vowels': 'N/A',
        'Num Consonants': 'N/A',
        'Num Numbers': 'N/A',
        'Num Other Chars': 'N/A',
        'Entropy': 'N/A',
        'Num Labels': 'N/A',
        'Num HTML Elements': 'N/A',
        **_unknown_geolocation()
    }

# Batch processing
def process_batch(batch, start_count, writer, reader=None, max_workers=5, delay=1):
    """Process a batch of domains concurrently and write each row as it completes.

    Rows are written to *writer* inside this function, in input order; the
    returned list is for inspection and should not be written again.

    Args:
        batch: Iterable of domains handed to ``process_url``.
        start_count: Value of the 'Count' column for the first item; later
            items get consecutive values based on their position in *batch*.
        writer: Object with a ``writerow(dict)`` method.
        reader: Optional GeoIP reader forwarded to ``process_url``.
        max_workers: Size of the thread pool.
        delay: Seconds to sleep after handling each result; falsy disables it.

    Returns:
        The list of row dicts that were written.
    """
    results = []
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [executor.submit(process_url, url, reader=reader) for url in batch]
        for idx, future in enumerate(futures):
            result = future.result()
            if result:
                # Add the count to each row
                result['Count'] = start_count + idx
                results.append(result)
                writer.writerow(result)  # Write each result immediately
                if result.get('Error'):
                    print(f"Failed to process {result['Domain Name']}: {result['Error']}")
                else:
                    print(f"Processed {result['Domain Name']} successfully.")
            # NOTE: all lookups are submitted up front, so this pause paces
            # result handling rather than the submission of lookups.
            if delay:
                time.sleep(delay)
    return results

# Main processing logic for a single file
def process_file(input_file, output_file, batch_size=50000, geolite2_database_path=None):
    """Read domains from a CSV file and write one enriched row per domain.

    Args:
        input_file: Path of a CSV file with a 'Domain' column.
        output_file: Path of the CSV file to create. It is not created when
            the input yields no domains.
        batch_size: Number of domains handed to ``process_batch`` at a time.
        geolite2_database_path: Optional path passed to
            ``geoip2.database.Reader``; when omitted no reader is used.

    Returns:
        None.
    """
    with open(input_file, 'r', newline='') as infile:
        urls = []
        for row in csv.DictReader(infile):
            # Short rows yield None for missing columns; treat them as empty.
            domain = (row.get('Domain') or '').strip()
            if domain:
                urls.append(domain)

    # Check if URLs were read successfully
    if not urls:
        print(f"No URLs found in {input_file}.")
        return

    fieldnames = [
        'Count', 'Domain Name', 'Registrar', 'Creation Date', 'Expiration Date',
        'Name Servers', 'Error', 'Length', 'Num Vowels', 'Num Consonants',
        'Num Numbers', 'Num Other Chars', 'Entropy', 'Num Labels', 'Num HTML Elements',
        'Country', 'City', 'Latitude', 'Longitude'
    ]
    # Ceiling division, so an exact multiple of batch_size is not over-counted.
    total_batches = (len(urls) + batch_size - 1) // batch_size

    with open(output_file, 'w', newline='') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()

        geo_reader = None
        if geolite2_database_path:
            geo_reader = geoip2.database.Reader(geolite2_database_path)

        try:
            global_count = 1  # 'Count' value of the next batch's first row
            for batch_number, start in enumerate(range(0, len(urls), batch_size), start=1):
                batch = urls[start:start + batch_size]
                print(f"Processing batch {batch_number} of {total_batches} for {input_file}")
                # process_batch writes every row itself; writing the returned
                # list again would duplicate each row in the output.
                results = process_batch(batch, global_count, writer, reader=geo_reader)
                if not results:
                    print(f"No valid results for batch {batch_number} in {input_file}.")
                global_count += len(batch)
        finally:
            # Release the database file even if a batch raised.
            if geo_reader is not None:
                geo_reader.close()