account_circle_poc.py 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311
  1. # -*- coding: utf-8 -*-
  2. """
Description: proof-of-concept crawler for the Dongchedi (懂车帝) owner-circle feed.
  4. """
  5. import time
  6. from collections import OrderedDict
  7. from datetime import datetime, timedelta
  8. from enum import Enum
  9. import uiautomator2
  10. from flask import jsonify
  11. from tools import loggerKit
  12. class account_circle_poc(Enum):
  13. R7 = "R7"
  14. F7 = "F7"
  15. def name(self):
  16. return "飞凡R7" if self == account_circle_poc.R7 else "飞凡F7"
  17. # 记录当前正在抓取的一级和二级tab
  18. current_primary_tab = None
  19. current_secondary_tab = None
  20. def crawl_data(device_id, car_type: account_circle_poc, task_transfer_time, sequence):
  21. data = __get_circle_data(device_id, car_type.name(), task_transfer_time, sequence)
  22. date = datetime.now().strftime('%y-%m-%d')
  23. loggerKit.info("data:{0}, date:{1}", jsonify(data), str(date))
  24. def __get_circle_data(device_id, car_name, task_transfer_time, sequence):
  25. """
  26. 抓取懂车帝车友圈数据
  27. :param device_id: 执行任务的设备
  28. :param car_name: 车友圈对应的汽车名称
  29. :param task_transfer_time: 数据截止时间
  30. """
  31. d = uiautomator2.connect(device_id)
  32. d.debug = False
  33. d.screen_on()
  34. d.unlock()
  35. d.press("recent")
  36. # 打开懂车帝app
  37. d.app_stop("com.ss.android.auto")
  38. d.app_start("com.ss.android.auto", ".policy.AutoPrivacyActivity")
  39. time.sleep(5)
  40. # app升级提醒弹窗处理
  41. if d(resourceId="com.ss.android.auto:id/dtq").wait(1):
  42. # 忽略升级
  43. d(resourceId="com.ss.android.auto:id/dtq").click()
  44. # 车友圈
  45. d.xpath(
  46. '//*[@resource-id="android:id/tabs"]/android.widget.RelativeLayout[2]/android.widget.LinearLayout[1]').click()
  47. time.sleep(2)
  48. # 搜索飞凡R7兴趣圈
  49. d(resourceId="com.ss.android.auto:id/ipm").click()
  50. d(resourceId="com.ss.android.auto:id/gt8").set_text(car_name)
  51. d(resourceId="com.ss.android.auto:id/fyw").click()
  52. time.sleep(2)
  53. d(text="车友圈").click()
  54. d(text="进入车友圈").click()
  55. #
  56. tabs = [
  57. # 动态
  58. (d.xpath(
  59. '//*[@resource-id="com.ss.android.auto:id/ce8"]/android.widget.LinearLayout[1]/android.widget.ImageView[1]'),
  60. "动态", "dongtai"),
  61. # 问答
  62. (d.xpath(
  63. '//*[@resource-id="com.ss.android.auto:id/ce8"]/android.widget.LinearLayout[2]/android.widget.ImageView[1]'),
  64. "问答", "answer"),
  65. # 口碑
  66. (d.xpath(
  67. '//*[@resource-id="com.ss.android.auto:id/ce8"]/android.widget.LinearLayout[3]/android.widget.ImageView[1]'),
  68. "口碑", "opinions"),
  69. ]
  70. data = dict()
  71. global current_primary_tab, current_secondary_tab
  72. for item in tabs:
  73. current_primary_tab = item[1]
  74. current_secondary_tab = None
  75. item[0].click()
  76. time.sleep(3)
  77. # 动态, 二级tab
  78. if item[1] == "动态":
  79. tmp_dict = dict()
  80. for sub_tab in (("热门", "hot"), ("全部", "all"), ("精华", "essence")):
  81. current_secondary_tab = sub_tab[0]
  82. d(resourceId="com.ss.android.auto:id/jy8", text=sub_tab[0]).click()
  83. time.sleep(3)
  84. if sub_tab[0] == "全部":
  85. # 按发布时间排序列表
  86. d(resourceId="com.ss.android.auto:id/jy8", text="全部").click()
  87. d(resourceId="com.ss.android.auto:id/io3", text="按发布时间排序").click()
  88. # 帖子列表
  89. recycler_view = d(resourceId="com.ss.android.auto:id/hh0",
  90. className="androidx.recyclerview.widget.RecyclerView")
  91. time.sleep(2)
  92. # 热门帖子固定抓取10条,根据任务排序上滑指定条数再抓取
  93. tmp_dict[sub_tab[1]] = __parse_dongchedi_circle_list(recycler_view, d, task_transfer_time,
  94. 10 if sub_tab[0] == "热门" else 0, sequence)
  95. time.sleep(1)
  96. #
  97. data[item[2]] = tmp_dict
  98. else:
  99. if item[1] == "问答":
  100. # 按发布时间排序列表
  101. d(resourceId="com.ss.android.auto:id/jy8", text="全部").click()
  102. d(resourceId="com.ss.android.auto:id/io3", text="按发布时间排序").click()
  103. # 帖子列表
  104. recycler_view = d(resourceId="com.ss.android.auto:id/hh0",
  105. className="androidx.recyclerview.widget.RecyclerView")
  106. time.sleep(2)
  107. data[item[2]] = __parse_dongchedi_circle_list(recycler_view, d, task_transfer_time, 0, sequence)
  108. time.sleep(1)
  109. #
  110. current_primary_tab = None
  111. current_secondary_tab = None
  112. #
  113. d.app_stop("com.ss.android.auto")
  114. return data
  115. def __parse_dongchedi_circle_list(listview, d: uiautomator2.Device, task_transfer_time, get_count, sequence):
  116. if not listview.exists:
  117. return []
  118. #
  119. transfer_date = datetime.strptime(task_transfer_time, '%Y-%m-%d').date()
  120. ret = OrderedDict()
  121. # get_count > 0 表示按固定数量抓取,按任务排序跳过前面的数据
  122. if not isinstance(sequence, int):
  123. sequence = 1
  124. skip_count = (sequence - 1) * get_count
  125. skip_cache = set()
  126. #
  127. while True:
  128. if d.xpath('//*[@text="没有更多了"]').exists:
  129. break
  130. cell = listview.child(resourceId="com.ss.android.auto:id/dvd", className="android.widget.LinearLayout")
  131. if not cell.exists:
  132. break
  133. # 帖子内容
  134. while not d(resourceId="com.ss.android.auto:id/jdr").wait(timeout=1):
  135. __retry(5, lambda: d.swipe_ext("up", 0.2))
  136. #
  137. content_view = d(resourceId="com.ss.android.auto:id/jdr")
  138. content = content_view.get_text()[:20]
  139. # 跳过指定数量的帖子
  140. if get_count > 0 and skip_count > 0:
  141. if content not in skip_cache:
  142. skip_count -= 1
  143. skip_cache.add(content)
  144. #
  145. __retry(5, lambda: d.swipe_ext("up", 0.2))
  146. continue
  147. #
  148. if content in skip_cache:
  149. __retry(5, lambda: d.swipe_ext("up", 0.2))
  150. continue
  151. # 根据帖子内容去重
  152. cache_key = content
  153. if cache_key in ret:
  154. # 内容已抓取
  155. __retry(5, lambda: d.swipe_ext("up", 0.2))
  156. continue
  157. # 发布时间
  158. time_view = __find_view(d, "down", lambda: content_view.down(resourceId="com.ss.android.auto:id/ihu"))
  159. if time_view is None:
  160. __retry(5, lambda: d.swipe_ext("up", 0.2))
  161. time.sleep(2)
  162. continue
  163. #
  164. time_str = time_view.get_text().split("最后回复")[0]
  165. time_str = __formate_time(time_str)
  166. #
  167. if get_count <= 0:
  168. # 不是按固定数量抓取时才比对发帖时间
  169. published_date = datetime.strptime(time_str, '%Y-%m-%d').date()
  170. if published_date > transfer_date:
  171. # 当前帖子时间晚于目标时间
  172. if current_primary_tab == "问答":
  173. # 列表内容可以漏抓,提高列表滑动效率
  174. if published_date - transfer_date > timedelta(days=2):
  175. __retry(5, lambda: d.swipe_ext("up", 1))
  176. elif published_date - transfer_date > timedelta(days=1):
  177. __retry(5, lambda: d.swipe_ext("up", 0.5))
  178. else:
  179. __retry(5, lambda: d.swipe_ext("up", 0.2))
  180. else:
  181. # 列表内容不能漏抓,不能滑动过快
  182. __retry(5, lambda: d.swipe_ext("up", 0.2))
  183. #
  184. continue
  185. elif published_date < transfer_date:
  186. # 当前帖子时间早于目标时间
  187. break
  188. # 昵称
  189. nickname = __find_view(d, "up", lambda: content_view.up(resourceId="com.ss.android.auto:id/v")).get_text()
  190. # 转发量
  191. share_count = __intValue(
  192. __find_view(d, "down", lambda: content_view.down(resourceId="com.ss.android.auto:id/jte")).get_text())
  193. # 评论数
  194. comment_count = __intValue(
  195. __find_view(d, "down", lambda: content_view.down(resourceId="com.ss.android.auto:id/i9j")).get_text())
  196. # 标题(标题有可能没有,所以这里不需要重试)
  197. title = ""
  198. title_view = content_view.up(resourceId="com.ss.android.auto:id/t", className="android.widget.TextView")
  199. if title_view is not None and title_view.get_text() != "车主评分":
  200. title = title_view.get_text()
  201. #
  202. item = {
  203. 'nickName': nickname,
  204. 'title': content if title == "" else title,
  205. 'publishDate': time_str,
  206. 'shareNum': share_count,
  207. 'commentNum': comment_count,
  208. }
  209. ret[cache_key] = item
  210. print(f"nickname = {nickname}, share_count = {share_count}, comment_count = {comment_count}, "
  211. f"content = {content}, title = {title}, time = {time_str}")
  212. #
  213. if 0 < get_count <= len(ret.values()):
  214. # 按固定数量抓取,已达预期
  215. break
  216. # 列表滚动有时会出错,需要加重试
  217. __retry(5, lambda: d.swipe_ext("up", 0.2))
  218. time.sleep(1)
  219. #
  220. return list(ret.values())
  221. def __retry(times, operation):
  222. sleep_time = 1
  223. while times > 0:
  224. try:
  225. operation()
  226. break
  227. except Exception as e:
  228. print("retry exception: \n")
  229. print(e)
  230. times -= 1
  231. time.sleep(sleep_time)
  232. sleep_time += 2
  233. def __intValue(value_str):
  234. value = 0
  235. try:
  236. value = int(value_str)
  237. except Exception as ex:
  238. pass
  239. return value
  240. def __find_view(d: uiautomator2.Device, direction, op):
  241. try:
  242. ret = op()
  243. retry_count = 10
  244. scroll_direction = ("up", "down")
  245. while ret is None and retry_count > 0 and direction in scroll_direction:
  246. retry_count -= 1
  247. __retry(5, lambda: d.swipe_ext("up" if direction == "down" else "down", 0.1))
  248. time.sleep(1)
  249. ret = op()
  250. #
  251. return ret
  252. except:
  253. return None
  254. def __formate_time(time_str):
  255. now = datetime.now()
  256. if '刚刚' in time_str:
  257. p_time = now
  258. elif '分钟' in time_str:
  259. minutes = int(time_str[:time_str.index('分')])
  260. p_time = now - timedelta(minutes=minutes)
  261. elif '小时' in time_str:
  262. hours = int(time_str[:time_str.index('小')])
  263. p_time = now - timedelta(hours=hours)
  264. elif '昨天' in time_str:
  265. p_time = now - timedelta(days=1)
  266. elif '前天' in time_str:
  267. p_time = now - timedelta(days=2)
  268. elif '天' in time_str:
  269. days = int(time_str[:time_str.index('天')])
  270. p_time = now - timedelta(days=days)
  271. elif '周' in time_str:
  272. weeks = int(time_str[:time_str.index('周') - 1])
  273. p_time = now - timedelta(weeks=weeks)
  274. elif '年' not in time_str and '月' in time_str and '日' in time_str:
  275. p_time = datetime.strptime(f'{now.year}年{time_str}', "%Y年%m月%d日").date()
  276. else:
  277. time_str = time_str.split()[0]
  278. items = time_str.split("-")
  279. if len(items) == 2:
  280. p_time = datetime.strptime(f'{now.year}年{items[0]}月{items[1]}日', "%Y年%m月%d日").date()
  281. elif len(items) == 3:
  282. p_time = datetime.strptime(f'{items[0]}年{items[1]}月{items[2]}日', "%Y年%m月%d日").date()
  283. else:
  284. return None
  285. #
  286. p_time = p_time.strftime('%Y-%m-%d')
  287. return p_time