"A11",
    'light_heavyweight' => "A13",
    'middleweight' => "A14",
    'welterweight' => "A15",
    'lightweight' => "A12",
    'featherweight' => "A9",
    'bantamweight' => "A8",
    'flyweight' => "A10",
  ];

  /**
   * Public constructor for Fighter Importer.
   *
   * @param Client $httpClient
   *   HTTP client used to download the fighter listing pages.
   * @param EntityTypeManager $entityTypeManager
   *   Entity type manager kept on the importer.
   * @param CacheBackendInterface $cache
   *   Cache backend kept on the importer.
   */
  public function __construct(
    Client $httpClient,
    EntityTypeManager $entityTypeManager,
    CacheBackendInterface $cache
  ) {
    $this->httpClient = $httpClient;
    $this->entityTypeManager = $entityTypeManager;
    $this->cache = $cache;
  }

  /**
   * Collects the current fighters and processes them division by division.
   *
   * The local $test_run switch is a developer aid: when set to TRUE a single
   * hard-coded fighter is scraped and dumped instead of running the import.
   */
  public function importFighters(): void {
    $test_run = FALSE;

    if ($test_run) {
      // Overriding to test!!!
      $fighter = new Fighter($this->httpClient);
      $fighter->first_name = 'scott';
      $fighter->last_name = 'adams';
      $fighter->scrapeDataFromFighterPage();

      // Dump a copy with the page and crawler properties removed.
      $fighter_clone = clone $fighter;
      unset($fighter_clone->fighter_page, $fighter_clone->crawler);
      dump($fighter_clone);
      return;
    }

    // Process each fighter into system.
    foreach ($this->getListOfCurrentFighters() as $division => $fighters) {
      self::processDivision($division, $fighters);
    }
  }

  /**
   * Process a division.
   *
   * For every fighter record a Fighter object is populated and scraped, its
   * media entity is created, and the player node with the title
   * "<first name> <last name>" is updated when one exists or created otherwise.
   *
   * @param mixed $div
   *   Division key; stored on each fighter as its class and used in log output.
   * @param mixed $fighters
   *   Iterable of fighter records, each providing the 'firstname', 'lastname',
   *   'image' and 'profile' keys.
   * @param array|null $context
   *   (optional) Passed by reference. $context['results']['processed'] is
   *   incremented once per processed division. NOTE(review): presumably the
   *   Batch API context - confirm against the caller.
   */
  public static function processDivision($div, $fighters, &$context = NULL): void {
    $logger = \Drupal::logger('ufc');
    $node_storage = \Drupal::entityTypeManager()->getStorage('node');

    $logger->notice("Starting to update $div");

    foreach ($fighters as $fighter_data) {
      $fighter = new Fighter(\Drupal::httpClient());
      $fighter->first_name = $fighter_data['firstname'];
      $fighter->last_name = $fighter_data['lastname'];
      $fighter->image = $fighter_data['image'];
      $fighter->class = $div;

      // NOTE(review): a failed scrape is only logged; the fighter is still
      // saved below with whatever data it holds - confirm this is intended.
      if (!$fighter->scrapeDataFromFighterPage($fighter_data['profile'])) {
        $logger->alert("FAILED: $fighter->first_name $fighter->last_name to " . $fighter_data['profile']);
      }

      $fighter->createMediaEntityFromImage();

      // Check if node exists, by title.
      $title = $fighter->first_name . " " . $fighter->last_name;
      // reset() takes its argument by reference, so the lookup result has to
      // live in a variable first.
      $matches = $node_storage->loadByProperties(['title' => $title]);
      $node_lookup = reset($matches);

      if (!empty($node_lookup)) {
        // Update instead of create.
        $fighter->updatePlayerNode($node_lookup->id());
        $logger->notice("$title updated successfully.");
      }
      else {
        $logger->warning("No existing player found for $title...creating");
        $fighter->createPlayerNode();
      }
    }

    // The counter may not exist yet, so start from zero in that case.
    $context['results']['processed'] = ($context['results']['processed'] ?? 0) + 1;
  }

  /**
   * Get list of current fighters.
   *
   * Walks every entry of $this->divisions, builds the listing URL from
   * self::UFC_BASE plus the division's URL fragment and pages through it.
   *
   * @return array
   *   The collected $this->fighters data, or an empty array when nothing was
   *   collected.
   */
  public function getListOfCurrentFighters(): array {
    foreach ($this->divisions as $division => $div_base_url) {
      // extractFighters() files its results under the current weight class.
      $this->weightClass = $division;
      echo "Starting import for " . $division . "\n";
      $this->loopThroughFighterPages(self::UFC_BASE . $div_base_url);
    }

    return $this->fighters ?? [];
  }

  /**
   * There is a pager, loop through to get all fighters.
   *
   * Requests pages 0 through 100 of the listing and stops at the first page
   * whose body contains "No Result Found For".
   *
   * @param string $base_url
   *   Listing URL to which "&page=N" is appended.
   */
  public function loopThroughFighterPages($base_url): void {
    // The request options are identical for every page, so build them once.
    // NOTE(review): 'verify' => FALSE turns off TLS certificate verification,
    // and 'referer' is documented by Guzzle as a sub-option of
    // 'allow_redirects' - confirm both are intentional.
    $options = [
      'referer' => TRUE,
      'verify' => FALSE,
      'headers' => [
        'User-Agent' => 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.4.1 Safari/605.1.15',
        'Accept' => 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Accept-Encoding' => 'gzip, deflate, br',
      ],
    ];

    // Implement caching to store instead of needing fresh requests.
    for ($page = 0; $page <= 100; $page++) {
      $url = $base_url . "&page=$page";
      /* $cid = "ufc:" . $url; */
      $response = $this->httpClient->request('GET', $url, $options);
      $content = $response->getBody()->getContents();

      // strpos() returns 0 for a match at the very start of the body, so the
      // result must be compared against FALSE explicitly.
      if (strpos($content, "No Result Found For") !== FALSE) {
        break;
      }

      \Drupal::logger('ufc')->notice("Extracting fighters from page $page.");
      $this->extractFighters($content);
    }
  }

  /**
   * Returns the given value with "hello" prepended.
   *
   * @param mixed $str
   *   Value interpolated after the "hello" prefix.
   *
   * @return string
   *   The prefixed string.
   */
  public function quickTest($str) {
    return "hello$str";
  }

  /**
   * Extract fighters from an html string.
*
   * Every '.c-listing-athlete-flipcard' element yields one record holding the
   * fighter's name, profile link and image sources. Records are stored in
   * $this->fighters under the current $this->weightClass, keyed by the name
   * with its spaces removed.
   *
   * @param string $input
   *   HTML markup of one listing page.
   */
  public function extractFighters(string $input): void {
    $crawler = new Crawler($input);

    // Read each card in a single pass so that name, profile and images can
    // never get out of step with one another.
    $cards = $crawler->filter('.c-listing-athlete-flipcard')->each(static function (Crawler $card) {
      return [
        'name' => $card->filter('.c-listing-athlete__name')->text(),
        'profile' => $card->filter('.e-button--black')->attr('href'),
        'images' => $card->filter('img')->each(static function (Crawler $img) {
          return $img->attr('src');
        }),
      ];
    });

    foreach ($cards as $card) {
      // Surrounding whitespace would otherwise leave the first name empty
      // after the split below.
      $name = trim($card['name']);
      if ($name === '') {
        // A card without a name cannot be turned into a fighter.
        continue;
      }

      $key = str_replace(" ", "", $name);

      // isset() also copes with $this->fighters not being populated yet.
      if (!isset($this->fighters[$this->weightClass][$key])) {
        $name_parts = explode(" ", $name, 2);
        $this->fighters[$this->weightClass][$key]['firstname'] = $name_parts[0];
        // Single-word names get a one-space placeholder as last name.
        $this->fighters[$this->weightClass][$key]['lastname'] = $name_parts[1] ?? " ";
        $this->fighters[$this->weightClass][$key]['profile'] = $card['profile'];
      }

      // The image is (re)assigned even for fighters that were already known:
      // the first source is used only when the card has exactly two images.
      $images = $card['images'];
      $this->fighters[$this->weightClass][$key]['image'] = count($images) === 2 ? $images[0] : FALSE;
    }
  }

}