Spaces:

lintasmediadanawa
/

web_scrape

Sleeping

File size: 2,818 Bytes

e3bbf73
673e694
de04e67
673e694
 
 
1fe8b82
559f4a6
 
346a29c
 
559f4a6
1fe8b82
 
559f4a6
 
 
 
 
 
 
 
673e694
346a29c
de04e67
6acd1bf
 
a270de9
51cbaf5
 
a270de9
346a29c
 
 
 
 
 
 
 
 
 
 
 
633923e
346a29c
 
 
 
4a4fd31
681d84b
 
4a4fd31
 
 
 
 
 
 
2e3c5d0
673e694
681d84b
673e694
 
 
 
b42350d
d4da168
673e694
 
 
d4da168
 
673e694

import asyncio
import logging
import re
from typing import Annotated, Optional
from urllib.parse import parse_qs, quote, urlsplit

import html2text
import requests
from bs4 import BeautifulSoup
from fastapi import FastAPI, Header, HTTPException, Query
from fastapi.middleware.cors import CORSMiddleware


# ASGI application object; the route decorators in this module register
# their handlers on it.
app = FastAPI()

# Enable CORS for every origin, HTTP method and request header.
# NOTE(review): wildcard origins are combined with allow_credentials=True.
# The FastAPI/Starlette CORS documentation says the wildcard lists should not
# be used together with credentials; none of the routes visible here read
# cookies or auth headers, so credentials may be unnecessary -- confirm
# before tightening.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"], 
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)


_GOOGLE_SEARCH_URL = "https://www.google.com/search"
_GOOGLE_TIMEOUT_SECONDS = 10

# Layout heuristics carried over unchanged from the original implementation:
# the first 24 <div> elements of the page are skipped, and only <div>s that
# have exactly 8 <div> ancestors are considered.
# NOTE(review): presumably tuned by hand against Google's no-JavaScript
# result page; they will need re-tuning whenever that markup changes.
_GOOGLE_SKIP_LEADING_DIVS = 24
_GOOGLE_RESULT_DIV_DEPTH = 8


def _unwrap_google_redirect(href: str) -> str:
    """Return the destination URL of a Google ``/url?q=<target>&...`` link.

    Hrefs that are not Google redirect links are returned unchanged.
    """
    if "/url?" not in href:
        return href
    targets = parse_qs(urlsplit(href).query).get("q")
    if targets:
        return targets[0]
    # No parsable ``q`` parameter: fall back to the plain prefix strip.
    return href.split("/url?q=")[-1]


@app.get("/google_search")
async def google_search(q: str, delimiter: str = "\n---\n", sites: Annotated[list[str] | None, Query()] = None):
    """Scrape a Google result page and return its text snippets.

    Args:
        q: Search terms.
        delimiter: String appended after every extracted snippet.
        sites: Optional domains; when given, the search is restricted to
            them by adding ``site:<domain>`` terms joined with ``OR`` to the
            query itself.

    Returns:
        ``{"results": <str>}`` where the string is every extracted snippet,
        each followed by ``delimiter``. A snippet whose ``<div>`` contains a
        link is rendered as a Markdown link ``[text](url)``.

    Raises:
        HTTPException: 502 when the request to Google fails at the
            transport level (connection error, timeout, ...).
    """
    log = logging.getLogger(__name__)

    # The site filter must be part of the ``q`` value; as a separate URL
    # parameter Google does not treat it as a search operator.
    query = q
    if sites:
        query += " " + " OR ".join(f"site:{site}" for site in sites)
    log.debug("google_search query=%r sites=%r", query, sites)

    try:
        # requests is blocking: run it in a worker thread so the event loop
        # keeps serving other requests. ``params`` URL-encodes the query.
        response = await asyncio.to_thread(
            requests.get,
            _GOOGLE_SEARCH_URL,
            params={"q": query},
            timeout=_GOOGLE_TIMEOUT_SECONDS,
        )
    except requests.RequestException as err:
        raise HTTPException(
            status_code=502, detail=f"Google request failed: {err}"
        ) from err

    soup = BeautifulSoup(response.content, "html.parser")

    entries = []
    for div in soup.find_all("div")[_GOOGLE_SKIP_LEADING_DIVS:]:
        if len(div.find_parents("div")) != _GOOGLE_RESULT_DIV_DEPTH:
            continue

        link = div.find(href=True, recursive=True)
        # Only text that is a direct child of this <div> (not nested deeper).
        text = div.find(string=True, recursive=False)
        if text is None:
            continue

        if link and text:
            text = f'[{text}]({_unwrap_google_redirect(link["href"])})'
        if text.strip():
            entries.append(text + delimiter)

    return {"results": "".join(entries)}


_TIKTOK_TIMEOUT_SECONDS = 10


def _parse_tiktok_markdown(docs: str) -> dict:
    """Pull engagement counts and post metadata out of html2text output.

    The parsing is purely positional and mirrors the page layout the
    original code was written against:

    * the sixth ``###``-delimited section holds the video details;
    * that section contains exactly four ``**bold**`` tokens, unpacked in
      order as likes, comments, bookmarks and shares;
    * the text after a ``Speed`` line holds the author name, a line ending
      in `` · <date>`` and finally the description.

    Raises:
        ValueError: if any of those layout assumptions does not hold.
    """
    sections = docs.split("###")
    if len(sections) < 6:
        raise ValueError(
            f"expected at least 6 '###' sections, found {len(sections)}"
        )
    content_detail = sections[5]

    counts = re.findall(r'\*\*([\w.]+)\*\*', content_detail)
    if len(counts) != 4:
        raise ValueError(f"expected 4 engagement counters, found {len(counts)}")
    likes, comments, bookmarks, shares = counts

    _, marker, after_speed = content_detail.partition("\n\nSpeed\n\n")
    if not marker:
        raise ValueError("'Speed' marker not found in the details section")

    # At most 6 splits: everything after the sixth newline stays in one piece.
    profile = [line.strip() for line in after_speed.split("\n", 6) if line.strip()]
    if len(profile) < 2:
        raise ValueError("author/date lines not found after the 'Speed' marker")

    return {
        "insights": {
            "likeCount": likes,
            "commentCount": comments,
            "bookmarkCount": bookmarks,
            "shareCount": shares,
        },
        "username": profile[0],
        "date": profile[1].rsplit(" · ", 1)[-1],
        "description": profile[-1].replace("**", ""),
    }


@app.get("/tiktok_details")
async def read_item(username: str, video_id:str):
    """Fetch a TikTok video page and return its public engagement details.

    Args:
        username: Account handle, without the leading ``@``.
        video_id: Identifier of the video in the page URL.

    Returns:
        A dict with ``insights`` (``likeCount``, ``commentCount``,
        ``bookmarkCount``, ``shareCount`` as the strings shown on the page),
        ``username``, ``date`` and ``description``.

    Raises:
        HTTPException: 502 when TikTok cannot be reached or the returned
            page does not have the expected layout.
    """
    log = logging.getLogger(__name__)

    # NOTE(review): the crawler user agent is presumably what makes TikTok
    # return server-rendered HTML -- confirm before changing it.
    user_agent = "Googlebot/2.1"

    # Quote the path segments so unusual characters cannot alter the URL.
    safe_user = quote(username, safe="")
    safe_video = quote(video_id, safe="")
    url = f"https://tiktok.com/@{safe_user}/video/{safe_video}"

    try:
        # requests is blocking: run it in a worker thread so the event loop
        # keeps serving other requests.
        res = await asyncio.to_thread(
            requests.get,
            url,
            headers={"user-agent": user_agent},
            timeout=_TIKTOK_TIMEOUT_SECONDS,
        )
    except requests.RequestException as err:
        raise HTTPException(
            status_code=502, detail=f"TikTok request failed: {err}"
        ) from err

    text_maker = html2text.HTML2Text()
    text_maker.ignore_links = True
    text_maker.ignore_images = True
    text_maker.bypass_tables = False

    html = res.content.decode("utf-8", errors="replace")
    log.debug("TikTok response body: %s", html)

    docs = text_maker.handle(html)
    log.debug("TikTok page as text: %s", docs)

    try:
        return _parse_tiktok_markdown(docs)
    except ValueError as err:
        raise HTTPException(
            status_code=502, detail=f"Unexpected TikTok page layout: {err}"
        ) from err