import { logError, LoggingCategory, logInfo } from "../../../utils/server/Logger";
import { AmplitudeBrowser } from "..//amplitude/index";
import { keepTextHtmlOnly } from "../../../utils/HtmlParser";

/*
 * Zyte API key, sent as the user name of the HTTP Basic credentials.
 * SECURITY: secrets should not live in source control. The ZYTE_SECRET environment
 * variable takes precedence; the literal is kept only as a backward-compatible
 * fallback and should be rotated and removed once the variable is set everywhere.
 */
const ZYTE_SECRET = process.env.ZYTE_SECRET || "5696a9fdf1e34f38b4cc28e6af08260e";

/*
 * Sends one extraction request to the Zyte API and returns the parsed JSON response.
 *
 * url        - page to extract
 * zyteConfig - extraction flags, merged as-is into the JSON request body
 *
 * Resolves to the parsed response body on success.
 * Resolves to undefined when Zyte answers 403 or 451 (tracked and logged, not thrown).
 * Throws on any other non-2xx status and on network/parse failures; the error is
 * logged and tracked before being rethrown.
 */
async function sendZyteRequest(
  url: string,
  zyteConfig: {
    product?: boolean;
    article?: boolean;
    browserHtml?: boolean;
    httpResponseBody?: boolean;
    articleOptions?: {
      extractFrom: "httpResponseBody" | "browserHtml";
    };
  }
) {
  // Single place that reports a failed request to Amplitude.
  const trackScraperError = (status: number | "unknown", statusText: string) => {
    AmplitudeBrowser.track("scraper_event", {
      device_id: "server",
      user_id: null,
      type: "error",
      scraper: "zyte",
      status,
      statusText,
      url,
    });
  };

  // Set once an HTTP failure has been tracked with its real status, so the catch
  // block below does not report the same failure a second time as "unknown".
  let failureAlreadyTracked = false;

  try {
    const response = await fetch("https://api.zyte.com/v1/extract", {
      method: "POST",
      body: JSON.stringify({
        url,
        ...zyteConfig,
      }),
      headers: {
        "Content-Type": "application/json",
        // Basic auth with the API key as user name and an empty password.
        Authorization: "Basic " + Buffer.from(ZYTE_SECRET + ":").toString("base64"),
      },
    });

    // 403 and 451 are treated as expected outcomes: report them and return nothing.
    if (response.status === 403 || response.status === 451) {
      trackScraperError(response.status, response.statusText);
      logInfo({
        message:
          response.status === 403
            ? "403 User account suspended. Need to extend average limit."
            : "451 Unavailable For Legal Reasons",
        category: LoggingCategory.ZYTE,
        data: {
          response,
        },
      });
      return;
    }

    if (!response.ok) {
      trackScraperError(response.status, response.statusText);
      failureAlreadyTracked = true;
      const responseBody = await response.text();
      throw new Error(
        `Network response was not ok. Status: ${response.status} - ${response.statusText}. Body: ${responseBody}`
      );
    }

    const data = await response.json();
    return data;
  } catch (error) {
    logError({
      error,
      message: "Zyte API or Network error occurred",
      reference: "ZyteGateway.sendZyteRequest",
      data: { url, zyteConfig },
      category: LoggingCategory.ZYTE,
    });
    if (!failureAlreadyTracked) {
      // String() instead of error.toString(): the caught value may be null/undefined.
      trackScraperError("unknown", String(error));
    }
    throw error;
  }
}

/*
 * Maps a URL onto a variant of the same site that is easier to scrape.
 * Currently only www.reddit.com is swapped for old.reddit.com; every other URL
 * is returned untouched.
 */
function rewriteUrl(url: string): string {
  const hardToScrapeHost = "www.reddit.com";
  const scrapableHost = "old.reddit.com";
  // String replace with a string pattern only touches the first occurrence.
  return url.includes(hardToScrapeHost) ? url.replace(hardToScrapeHost, scrapableHost) : url;
}

/*
 * Uses zyte to scrape a page and returns its title plus text-only html.
 *
 * By default the server html response is requested (fast compared to browser html);
 * if that comes back empty, the request is repeated once with browser html.
 * If tryExtractArticle is true, zyte is also asked to extract the article, and the
 * article body is used when its reported probability is above 0.5.
 * Non-textual html is always stripped from the result to minimize tokens.
 *
 * Resolves to undefined when the zyte request yields no data (see sendZyteRequest).
 */
async function scrapeHtmlPage(
  url: string,
  config = {
    tryExtractArticle: false,
    browserHtmlRequired: false,
  }
): Promise<{ pageTitle: string; pageHtmlTextOnly: string } | undefined> {
  const rewrittenUrl = rewriteUrl(url);
  const extractFrom: "httpResponseBody" | "browserHtml" = config.browserHtmlRequired
    ? "browserHtml"
    : "httpResponseBody";
  const zyteData = await sendZyteRequest(rewrittenUrl, {
    ...(config.browserHtmlRequired ? { browserHtml: true } : { httpResponseBody: true }),
    article: config.tryExtractArticle,
    articleOptions: config.tryExtractArticle ? { extractFrom } : undefined,
  });
  if (!zyteData) {
    return undefined;
  }

  // Zyte returns browserHtml as plain text; only httpResponseBody is base64-encoded.
  // A missing body is treated as empty so the browser-html retry below can run.
  let zyteResponseBody = "";
  if (typeof zyteData.browserHtml === "string" && zyteData.browserHtml) {
    zyteResponseBody = zyteData.browserHtml;
  } else if (typeof zyteData.httpResponseBody === "string") {
    zyteResponseBody = Buffer.from(zyteData.httpResponseBody, "base64").toString("utf-8");
  }

  if (!zyteResponseBody && !config.browserHtmlRequired) {
    // we were trying to use server html, but it was not available
    // try again with browser html
    return scrapeHtmlPage(rewrittenUrl, {
      tryExtractArticle: config.tryExtractArticle,
      browserHtmlRequired: true,
    });
  }

  // metadata may be absent even when an article object is returned.
  const articleProbabilityPass = (zyteData.article?.metadata?.probability ?? 0) > 0.5;
  const articleHtml =
    (articleProbabilityPass && zyteData.article?.articleBodyHtml) || zyteResponseBody;
  const articleHtmlTextOnly = keepTextHtmlOnly(articleHtml);
  const pageTitle = zyteData.article?.headline || zyteData.title || zyteData.url || "";
  console.log(
    `zyte - scrapeHtmlPage - pageTitle=[${pageTitle}] final content length=${articleHtmlTextOnly.length}`
  );
  return { pageTitle, pageHtmlTextOnly: articleHtmlTextOnly };
}

/*
 * Scrapes a page with article extraction enabled, starting from the server html.
 * Thin wrapper around scrapeHtmlPage; resolves to undefined whenever that does.
 */
async function scrapeArticlePage(url: string): Promise<
  | {
      pageTitle: string;
      pageHtmlTextOnly: string;
    }
  | undefined
> {
  return scrapeHtmlPage(url, { tryExtractArticle: true, browserHtmlRequired: false });
}

/*
 * Requests zyte's product extraction for the given url and hands back
 * whatever sendZyteRequest resolves to, unmodified.
 */
async function scrapeProductPage(url: string): Promise<any> {
  const productExtraction = { product: true };
  return sendZyteRequest(url, productExtraction);
}

/*
 * Exported surface of this module: the three scrape helpers plus the raw
 * request function under a double-underscore name (presumably exposed for
 * tests or low-level callers — confirm before relying on it).
 */
export const ZyteGateway = {
  __sendZyteRequest: sendZyteRequest,
  scrapeArticlePage: scrapeArticlePage,
  scrapeHtmlPage: scrapeHtmlPage,
  scrapeProductPage: scrapeProductPage,
};
