代码拉取完成,页面将自动刷新
<!DOCTYPE html>
<html class="theme-next pisces use-motion" lang="zh-CN">
<head><meta name="generator" content="Hexo 3.8.0">
<meta charset="UTF-8">
<meta http-equiv="X-UA-Compatible" content="IE=edge">
<meta name="viewport" content="width=device-width, initial-scale=1, maximum-scale=2">
<meta name="theme-color" content="#222">
<link rel="stylesheet" href="//fonts.googleapis.com/css?family=Monda:300,300italic,400,400italic,700,700italic|Roboto Slab:300,300italic,400,400italic,700,700italic|Lobster Two:300,300italic,400,400italic,700,700italic|PT Mono:300,300italic,400,400italic,700,700italic&subset=latin,latin-ext">
<link rel="stylesheet" href="/lib/font-awesome/css/font-awesome.min.css?v=4.7.0">
<link rel="stylesheet" href="/css/main.css?v=7.1.2">
<link rel="apple-touch-icon" sizes="180x180" href="/images/apple-touch-icon-next.png?v=7.1.2">
<link rel="icon" type="image/png" sizes="32x32" href="/images/favicon-32x32-next.png?v=7.1.2">
<link rel="icon" type="image/png" sizes="16x16" href="/favicon.ico?v=7.1.2">
<link rel="mask-icon" href="/images/logo.svg?v=7.1.2" color="#222">
<script id="hexo.configurations">
// Theme namespace: reuse window.NexT when it already exists, otherwise start from an empty object.
var NexT = window.NexT || {};
// Global configuration object for this page. The option names suggest theme feature
// switches — NOTE(review): their consumers live in external scripts not visible here.
var CONFIG = {
root: '/',
scheme: 'Pisces',
version: '7.1.2',
sidebar: {"position":"left","display":"hide","offset":12,"onmobile":false,"dimmer":false},
back2top: true,
back2top_sidebar: false,
fancybox: false,
fastclick: false,
lazyload: false,
tabs: true,
motion: {"enable":true,"async":true,"transition":{"post_block":"fadeIn","post_header":"slideDownIn","post_body":"slideDownIn","coll_header":"slideLeftIn","sidebar":"slideUpIn"}},
// Algolia settings: applicationID, apiKey and indexName are all empty strings on this page.
algolia: {
applicationID: '',
apiKey: '',
indexName: '',
hits: {"per_page":10},
// Label strings containing ${query}, ${hits} and ${time} placeholders (plain strings, not template literals).
labels: {"input_placeholder":"Search for Posts","hits_empty":"We didn't find any results for the search: ${query}","hits_stats":"${hits} results found in ${time} ms"}
}
};
</script>
<meta name="description" content="Spark简介">
<meta name="keywords" content="spark">
<meta property="og:type" content="article">
<meta property="og:title" content="Spark项目落地实战以及日常大数据开发注意事项">
<meta property="og:url" content="https://www.dudefu.tk/Spark项目落地实战以及日常大数据开发注意事项.html">
<meta property="og:site_name" content="The Future">
<meta property="og:description" content="Spark简介">
<meta property="og:locale" content="zh-CN">
<meta property="og:image" content="https://tva1.sinaimg.cn/large/008eGmZEgy1gmks7g9ip7j30fa0770tg.jpg">
<meta property="og:image" content="https://tva1.sinaimg.cn/large/008eGmZEgy1gmks7iy2qnj30fa08nju4.jpg">
<meta property="og:image" content="https://tva1.sinaimg.cn/large/008eGmZEgy1gmks7lcf7aj30fa03sgmk.jpg">
<meta property="og:image" content="https://tva1.sinaimg.cn/large/008eGmZEgy1gmks7p5ynzj30fa078glw.jpg">
<meta property="og:image" content="https://tva1.sinaimg.cn/large/008eGmZEgy1gmks7scd41j30fa07640c.jpg">
<meta property="og:image" content="https://tva1.sinaimg.cn/large/008eGmZEgy1gmks7vircej30fa09a750.jpg">
<meta property="og:image" content="https://tva1.sinaimg.cn/large/008eGmZEgy1gmks7ygkecj30fa09sta6.jpg">
<meta property="og:image" content="https://tva1.sinaimg.cn/large/008eGmZEgy1gmks81s5qnj30fa05574q.jpg">
<meta property="og:image" content="https://tva1.sinaimg.cn/large/008eGmZEgy1gmks842uowj30fa07bq5d.jpg">
<meta property="og:image" content="https://tva1.sinaimg.cn/large/008eGmZEgy1gmks86w7u9j30fa08cac6.jpg">
<meta property="og:image" content="https://tva1.sinaimg.cn/large/008eGmZEgy1gmks89mfdaj30fa08k40z.jpg">
<meta property="og:image" content="https://tva1.sinaimg.cn/large/008eGmZEgy1gmks8dail0j30fa0csjvb.jpg">
<meta property="og:image" content="https://tva1.sinaimg.cn/large/008eGmZEgy1gmks8g81p6j30fa0chq75.jpg">
<meta property="og:image" content="https://tva1.sinaimg.cn/large/008eGmZEgy1gmks8j994gj30fa07rta1.jpg">
<meta property="og:image" content="https://tva1.sinaimg.cn/large/008eGmZEgy1gmks8lykeaj30fa0enwkb.jpg">
<meta property="og:image" content="https://tva1.sinaimg.cn/large/008eGmZEgy1gmks8p5bh0j30fa0cc78s.jpg">
<meta property="og:image" content="https://tva1.sinaimg.cn/large/008eGmZEgy1gmks8sw7yuj30fa0bc456.jpg">
<meta property="og:updated_time" content="2021-01-12T04:08:55.595Z">
<meta name="twitter:card" content="summary">
<meta name="twitter:title" content="Spark项目落地实战以及日常大数据开发注意事项">
<meta name="twitter:description" content="Spark简介">
<meta name="twitter:image" content="https://tva1.sinaimg.cn/large/008eGmZEgy1gmks7g9ip7j30fa0770tg.jpg">
<link rel="canonical" href="https://www.dudefu.tk/Spark项目落地实战以及日常大数据开发注意事项">
<script id="page.configurations">
// Per-page settings attached to the global CONFIG object; `sidebar` is an empty string for this page.
CONFIG.page = {
sidebar: "",
};
</script>
<title>Spark项目落地实战以及日常大数据开发注意事项 | The Future</title>
<noscript>
<style>
.use-motion .motion-element,
.use-motion .brand,
.use-motion .menu-item,
.sidebar-inner,
.use-motion .post-block,
.use-motion .pagination,
.use-motion .comments,
.use-motion .post-header,
.use-motion .post-body,
.use-motion .collection-title { opacity: initial; }
.use-motion .logo,
.use-motion .site-title,
.use-motion .site-subtitle {
opacity: initial;
top: initial;
}
.use-motion .logo-line-before i { left: initial; }
.use-motion .logo-line-after i { right: initial; }
</style>
</noscript>
</head>
<body itemscope="" itemtype="http://schema.org/WebPage" lang="zh-CN">
<div class="container sidebar-position-left page-post-detail">
<div class="headband"></div>
<header id="header" class="header" itemscope="" itemtype="http://schema.org/WPHeader">
<div class="header-inner"><div class="site-brand-wrapper">
<div class="site-meta">
<div class="custom-logo-site-title">
<a href="/" class="brand" rel="start">
<span class="logo-line-before"><i></i></span>
<span class="site-title">The Future</span>
<span class="logo-line-after"><i></i></span>
</a>
</div>
<h1 class="site-subtitle" itemprop="description">Stay hungry,stay foolish.</h1>
</div>
<div class="site-nav-toggle">
<button aria-label="切换导航栏">
<span class="btn-bar"></span>
<span class="btn-bar"></span>
<span class="btn-bar"></span>
</button>
</div>
</div>
<nav class="site-nav">
<ul id="menu" class="menu">
<li class="menu-item menu-item-home">
<a href="/" rel="section"><i class="menu-item-icon fa fa-fw fa-home"></i> <br>首页</a>
</li>
<li class="menu-item menu-item-archives">
<a href="/archives/" rel="section"><i class="menu-item-icon fa fa-fw fa-archive"></i> <br>归档<span class="badge">125</span></a>
</li>
<li class="menu-item menu-item-categories">
<a href="/categories" rel="section"><i class="menu-item-icon fa fa-fw fa-th"></i> <br>分类<span class="badge">15</span></a>
</li>
<li class="menu-item menu-item-tags">
<a href="/tags" rel="section"><i class="menu-item-icon fa fa-fw fa-tags"></i> <br>标签<span class="badge">63</span></a>
</li>
<li class="menu-item menu-item-something">
<a href="/something" rel="section"><i class="menu-item-icon fa fa-fw fa-paper-plane"></i> <br>干货</a>
</li>
<li class="menu-item menu-item-about">
<a href="/about/" rel="section"><i class="menu-item-icon fa fa-fw fa-user"></i> <br>关于</a>
</li>
<li class="menu-item menu-item-search">
<a href="javascript:;" class="popup-trigger">
<i class="menu-item-icon fa fa-search fa-fw"></i> <br>搜索</a>
</li>
</ul>
<div class="site-search">
<div class="popup search-popup local-search-popup">
<div class="local-search-header clearfix">
<span class="search-icon">
<i class="fa fa-search"></i>
</span>
<span class="popup-btn-close">
<i class="fa fa-times-circle"></i>
</span>
<div class="local-search-input-wrapper">
<!-- The placeholder alone is not an accessible name; aria-label gives assistive technology a label for the search field. -->
<input autocomplete="off" placeholder="搜索..." spellcheck="false" type="text" id="local-search-input" aria-label="搜索">
</div>
</div>
<div id="local-search-result"></div>
</div>
</div>
</nav>
</div>
</header>
<main id="main" class="main">
<div class="main-inner">
<div class="content-wrap">
<div id="content" class="content">
<div id="posts" class="posts-expand">
<div class="reading-progress-bar"></div>
<article class="post post-type-normal" itemscope="" itemtype="http://schema.org/Article">
<div class="post-block">
<link itemprop="mainEntityOfPage" href="https://www.dudefu.tk/Spark项目落地实战以及日常大数据开发注意事项.html">
<span hidden itemprop="author" itemscope="" itemtype="http://schema.org/Person">
<meta itemprop="name" content="Daniel X">
<meta itemprop="description" content="專注于大数据技術,分享干货">
<meta itemprop="image" content="https://hexoblog-1254111960.cos.ap-guangzhou.myqcloud.com/HexoBlog-tou.jpg">
</span>
<span hidden itemprop="publisher" itemscope="" itemtype="http://schema.org/Organization">
<meta itemprop="name" content="The Future">
</span>
<header class="post-header">
<h2 class="post-title" itemprop="name headline">Spark项目落地实战以及日常大数据开发注意事项
</h2>
<div class="post-meta">
<span class="post-time">
<span class="post-meta-item-icon">
<i class="fa fa-calendar-o"></i>
</span>
<span class="post-meta-item-text">发表于</span>
<time title="创建时间:2021-01-12 12:06:11 / 修改时间:12:08:55" itemprop="dateCreated datePublished" datetime="2021-01-12T12:06:11+08:00">2021-01-12</time>
</span>
<span class="post-category">
<span class="post-meta-divider">|</span>
<span class="post-meta-item-icon">
<i class="fa fa-folder-o"></i>
</span>
<span class="post-meta-item-text">分类于</span>
<span itemprop="about" itemscope="" itemtype="http://schema.org/Thing"><a href="/categories/大数据/" itemprop="url" rel="index"><span itemprop="name">大数据</span></a></span>
</span>
<span class="post-comments-count">
<span class="post-meta-divider">|</span>
<span class="post-meta-item-icon">
<i class="fa fa-comment-o"></i>
</span>
<span class="post-meta-item-text">评论数:</span>
<a href="/Spark项目落地实战以及日常大数据开发注意事项.html#comments" itemprop="discussionUrl">
<span class="post-comments-count valine-comment-count" data-xid="/Spark项目落地实战以及日常大数据开发注意事项.html" itemprop="commentCount"></span>
</a>
</span>
<span id="/Spark项目落地实战以及日常大数据开发注意事项.html" class="leancloud_visitors" data-flag-title="Spark项目落地实战以及日常大数据开发注意事项">
<span class="post-meta-divider">|</span>
<span class="post-meta-item-icon">
<i class="fa fa-eye"></i>
</span>
<span class="post-meta-item-text">阅读次数:</span>
<span class="leancloud-visitors-count"></span>
</span>
</div>
</header>
<div class="post-body" itemprop="articleBody">
<p>Spark简介 </p>
<p><img src="https://tva1.sinaimg.cn/large/008eGmZEgy1gmks7g9ip7j30fa0770tg.jpg" alt="img"></p>
<a id="more"></a>
<p>基于内存的分布式集群计算平台</p>
<p>可适配 Python、Java、Scala、SQL</p>
<p>拓展功能:机器学习、流式计算、图计算</p>
<p>Spark特点 </p>
<p><img src="https://tva1.sinaimg.cn/large/008eGmZEgy1gmks7iy2qnj30fa08nju4.jpg" alt="img"></p>
<p>高效 </p>
<ul>
<li>内存计算引擎</li>
<li>DAG图</li>
<li>比MapReduce快10~100倍</li>
</ul>
<h4 id="易用"><a href="#易用" class="headerlink" title="易用"></a>易用</h4><ul>
<li>提供丰富的API,支持Java,Scala, Python</li>
<li>代码量小</li>
</ul>
<h4 id="与Hadoop集成"><a href="#与Hadoop集成" class="headerlink" title="与Hadoop集成"></a>与Hadoop集成</h4><ul>
<li>读写HDFS、Hbase、Hive</li>
<li>和Yarn集成</li>
</ul>
<p>与Oracle存过的对比 </p>
<p><img src="https://tva1.sinaimg.cn/large/008eGmZEgy1gmks7lcf7aj30fa03sgmk.jpg" alt="img"></p>
<p>Spark应用场景 </p>
<p><img src="https://tva1.sinaimg.cn/large/008eGmZEgy1gmks7p5ynzj30fa078glw.jpg" alt="img"></p>
<ul>
<li>数据仓库</li>
<li>机器学习</li>
<li>海量数据离线分析</li>
<li>实时数据流处理</li>
</ul>
<p>基本概念 </p>
<p><img src="https://tva1.sinaimg.cn/large/008eGmZEgy1gmks7scd41j30fa07640c.jpg" alt="img"></p>
<p>集群架构 </p>
<p><img src="https://tva1.sinaimg.cn/large/008eGmZEgy1gmks7vircej30fa09a750.jpg" alt="img"></p>
<ul>
<li>集群资源管理器(Cluster Manager)</li>
<li>运行作业任务的工作节点(Worker Node)</li>
<li>每个应用的任务控制节点(Driver)</li>
<li>每个工作节点上负责具体任务的执行进程 (Executor)</li>
<li>资源管理器Mesos或YARN</li>
</ul>
<p>任务执行流程 </p>
<p><img src="https://tva1.sinaimg.cn/large/008eGmZEgy1gmks7ygkecj30fa09sta6.jpg" alt="img"></p>
<ol>
<li>首先为应用构建起基本的运行环境,即由 Driver创建一个SparkContext,进行资源 的申请、任务的分配和监控</li>
<li>资源管理器为Executor分配资源,并启动 Executor进程</li>
<li>SparkContext根据RDD的依赖关系构建 DAG图,DAG图提交给DAGScheduler解 析成Stage,然后把一个个TaskSet提交给 底层调度器TaskScheduler处理; Executor向SparkContext申请Task,Task Scheduler将Task发放给Executor运行, 并提供应用程序代码</li>
<li>Task在Executor上运行,把执行结果反馈 给TaskScheduler,然后反馈给 DAGScheduler,运行完毕后写入数据并 释放所有资源。</li>
</ol>
<p>数据处理过程 </p>
<p><img src="https://tva1.sinaimg.cn/large/008eGmZEgy1gmks81s5qnj30fa05574q.jpg" alt="img"></p>
<ol>
<li>读入外部数据源</li>
<li>转换算子进行数据处理</li>
<li>动作算子进行处理流程触发</li>
<li>处理完成输出结果</li>
</ol>
<p>常用算子-转换 </p>
<p><img src="https://tva1.sinaimg.cn/large/008eGmZEgy1gmks842uowj30fa07bq5d.jpg" alt="img"></p>
<p>开发案例–集团电信三码低效资产分析 </p>
<p><img src="https://tva1.sinaimg.cn/large/008eGmZEgy1gmks86w7u9j30fa08cac6.jpg" alt="img"></p>
<p><img src="https://tva1.sinaimg.cn/large/008eGmZEgy1gmks89mfdaj30fa08k40z.jpg" alt="img"></p>
<p>Spark很香、也很坑</p>
<p>坑1:无法自定义自增序列</p>
<p>坑2:Spark Stage之间的血缘冗长</p>
<p>坑3:直连Oracle读取慢</p>
<p>坑4:时间格式支持不友好</p>
<p>常见问题1-无法自定义自增序列 </p>
<p>问题阐述:</p>
<p>在不同的业务逻辑中,由于会存在多种维度的分析,但是他们的结果是写入到同一张表格中的。在oracle中执行的时候是根据oracle中定义的序列来保证ID的唯一性,但是我们代码实现的时候采用的数据加载模式时无法加载oracle中的序列,并且读取序列也会受到oracle序列缓冲的影响。所以在业务逻辑处理上我们得自己定义一个属于我们业务的ID序列,并且需要保证唯一性。</p>
<p><img src="https://tva1.sinaimg.cn/large/008eGmZEgy1gmks8dail0j30fa0csjvb.jpg" alt="img"></p>
<p><img src="https://tva1.sinaimg.cn/large/008eGmZEgy1gmks8g81p6j30fa0chq75.jpg" alt="img"></p>
<p>常见问题2-血缘关系冗长 </p>
<p>问题阐述:</p>
<p>由于SparkSQL在解析成AST树时会向上追溯血缘并重复解析,且随着血缘关系的增长AST树会变得越来越复杂,导致任务执行效率会严重降低。具体表象为 Spark任务在执行过程中会卡住不动,程序继续卡顿几个小时之后才会开始继续运行。</p>
<p><img src="https://tva1.sinaimg.cn/large/008eGmZEgy1gmks8j994gj30fa07rta1.jpg" alt="img"></p>
<p>方案一:checkpoints方式切割方式</p>
<p>方案二:hdfs落地,使用时二次读取</p>
<p>常见问题3-读Oracle速率慢 </p>
<p>问题阐述:</p>
<p>在读取Oracle时,数据表未做分区,程序无法通过指定分区并行加载数据,且为了减小数据库IO压力,采用限制高频、数据读取限制等策略,导致读取Oracle速率很慢,影响计算效率。</p>
<p><img src="https://tva1.sinaimg.cn/large/008eGmZEgy1gmks8lykeaj30fa0enwkb.jpg" alt="img"></p>
<p><img src="https://tva1.sinaimg.cn/large/008eGmZEgy1gmks8p5bh0j30fa0cc78s.jpg" alt="img"></p>
<p>常见问题4-Oracle时间格式支持不友好 </p>
<p>问题阐述:</p>
<p>Spark在读写Oracle时date类型数据容易丢失精度,例如: Oracle中 2019-12-20 05:44:30读取后为2019-12-20, Spark中2019-12-20 05:44:30写入后变成2019-12-20 00:00:00</p>
<p><img src="https://tva1.sinaimg.cn/large/008eGmZEgy1gmks8sw7yuj30fa0bc456.jpg" alt="img"></p>
<p>解决方案:</p>
<p>Oracle方言,即自定义一种数据库解释语言,实际上的实现 为数据的类型转换。OracleDateTypeInit.oracleInit()</p>
</div>
<footer class="post-footer">
<div class="post-tags">
<a href="/tags/spark/" rel="tag"><i class="fa fa-tag"></i> spark</a>
</div>
<div class="post-widgets">
<div class="social_share">
<div>
<script src="//cdn.jsdelivr.net/npm/ilyabirman-likely@2/release/likely.js"></script>
<link rel="stylesheet" href="//cdn.jsdelivr.net/npm/ilyabirman-likely@2/release/likely.css">
<div class="likely likely-light">
<div class="twitter">Tweet</div>
<div class="facebook">Share</div>
<div class="linkedin">Link</div>
<div class="gplus">Plus</div>
<div class="vkontakte">Share</div>
<div class="odnoklassniki">Class</div>
<div class="telegram">Send</div>
<div class="whatsapp">Send</div>
<div class="pinterest">Pin</div>
</div>
</div>
<div>
<div class="bdsharebuttonbox">
<a href="#" class="bds_tsina" data-cmd="tsina" title="分享到新浪微博"></a>
<a href="#" class="bds_douban" data-cmd="douban" title="分享到豆瓣网"></a>
<a href="#" class="bds_sqq" data-cmd="sqq" title="分享到QQ好友"></a>
<a href="#" class="bds_qzone" data-cmd="qzone" title="分享到QQ空间"></a>
<a href="#" class="bds_weixin" data-cmd="weixin" title="分享到微信"></a>
<a href="#" class="bds_tieba" data-cmd="tieba" title="分享到百度贴吧"></a>
<a href="#" class="bds_twi" data-cmd="twi" title="分享到Twitter"></a>
<a href="#" class="bds_fbook" data-cmd="fbook" title="分享到Facebook"></a>
<a href="#" class="bds_more" data-cmd="more"></a>
<a class="bds_count" data-cmd="count"></a>
</div>
<script>
// Settings object published on window as _bd_share_config; the next inline script injects share.js.
// NOTE(review): the meaning of each bd*/view* option is defined by the Baidu Share script, which is not visible here.
window._bd_share_config = {
"common": {
"bdText": "",
"bdMini": "2",
"bdMiniList": false,
"bdPic": ""
},
"share": {
"bdSize": "16",
"bdStyle": "0"
},
"image": {
"viewList": ["tsina", "douban", "sqq", "qzone", "weixin", "twi", "fbook"],
"viewText": "分享到:",
"viewSize": "16"
}
}
</script>
<script>
// Injects the Baidu Share script: a new <script> element is appended to <head>
// (or to <body> when no <head> element is found) and then pointed at share.js.
(function (doc) {
  var mountPoint = doc.getElementsByTagName('head')[0] || doc.body;
  var loader = mountPoint.appendChild(doc.createElement('script'));
  // 36e5 ms is one hour, so the cdnversion query value changes once per hour.
  loader.src = '//bdimg.share.baidu.com/static/api/js/share.js?cdnversion=' + ~(-new Date() / 36e5);
})(document);
</script>
</div>
</div>
</div>
<div class="post-nav">
<div class="post-nav-next post-nav-item">
<a href="/Elasticsearch查询速度为什么这么快?.html" rel="next" title="Elasticsearch查询速度为什么这么快?">
<i class="fa fa-chevron-left"></i> Elasticsearch查询速度为什么这么快?
</a>
</div>
<span class="post-nav-divider"></span>
<div class="post-nav-prev post-nav-item">
<a href="/基于云计算和大数据的模拟车辆行车监控系统.html" rel="prev" title="基于云计算和大数据的模拟车辆行车监控系统">
基于云计算和大数据的模拟车辆行车监控系统 <i class="fa fa-chevron-right"></i>
</a>
</div>
</div>
</footer>
</div>
</article>
</div>
</div>
<div class="comments" id="comments">
<div id="lv-container" data-id="city" data-uid="MTAyMC8yOTk3My82NTM4"></div>
</div>
</div>
<div class="sidebar-toggle">
<div class="sidebar-toggle-line-wrap">
<span class="sidebar-toggle-line sidebar-toggle-line-first"></span>
<span class="sidebar-toggle-line sidebar-toggle-line-middle"></span>
<span class="sidebar-toggle-line sidebar-toggle-line-last"></span>
</div>
</div>
<aside id="sidebar" class="sidebar">
<div class="sidebar-inner">
<ul class="sidebar-nav motion-element">
<li class="sidebar-nav-toc sidebar-nav-active" data-target="post-toc-wrap">
文章目录
</li>
<li class="sidebar-nav-overview" data-target="site-overview-wrap">
站点概览
</li>
</ul>
<div class="site-overview-wrap sidebar-panel">
<div class="site-overview">
<div class="site-author motion-element" itemprop="author" itemscope="" itemtype="http://schema.org/Person">
<img class="site-author-image" itemprop="image" src="https://hexoblog-1254111960.cos.ap-guangzhou.myqcloud.com/HexoBlog-tou.jpg" alt="Daniel X">
<p class="site-author-name" itemprop="name">Daniel X</p>
<div class="site-description motion-element" itemprop="description">專注于大数据技術,分享干货</div>
</div>
<nav class="site-state motion-element">
<div class="site-state-item site-state-posts">
<a href="/archives/">
<span class="site-state-item-count">125</span>
<span class="site-state-item-name">日志</span>
</a>
</div>
<div class="site-state-item site-state-categories">
<a href="/categories">
<span class="site-state-item-count">15</span>
<span class="site-state-item-name">分类</span>
</a>
</div>
<div class="site-state-item site-state-tags">
<a href="/tags">
<span class="site-state-item-count">63</span>
<span class="site-state-item-name">标签</span>
</a>
</div>
</nav>
<div class="links-of-author motion-element">
<span class="links-of-author-item">
<span class="exturl" data-url="aHR0cHM6Ly9naXRodWIuY29tL2R1ZGVmdQ==" title="GitHub → https://github.com/dudefu"><i class="fa fa-fw fa-github"></i>GitHub</span>
</span>
<span class="links-of-author-item">
<span class="exturl" data-url="bWFpbHRvOmR1ZGVmdUBmb3htYWlsLmNvbT9zdWJqZWN0PUhlbGxvJTIwYWdhaW4=" title="E-mail → mailto:dudefu@foxmail.com?subject=Hello%20again"><i class="fa fa-fw fa-envelope"></i>E-mail</span>
</span>
<span class="links-of-author-item">
<span class="exturl" data-url="aHR0cHM6Ly93ZWliby5jb20vZHVkZWZ1" title="Weibo → https://weibo.com/dudefu"><i class="fa fa-fw fa-weibo"></i>Weibo</span>
</span>
<span class="links-of-author-item">
<span class="exturl" data-url="aHR0cHM6Ly93cGEucXEuY29tL21zZ3JkP3Y9MyZ1aW49MTU3NzU3MTk1OSZzaXRlPWR1ZGVmdS5pbmZvJm1lbnU9eWVz" title="QQ → https://wpa.qq.com/msgrd?v=3&uin=1577571959&site=dudefu.info&menu=yes"><i class="fa fa-fw fa-qq"></i>QQ</span>
</span>
</div>
</div>
</div>
<!--noindex-->
<div class="post-toc-wrap motion-element sidebar-panel sidebar-panel-active">
<div class="post-toc">
<div class="post-toc-content"><ol class="nav"><li class="nav-item nav-level-4"><a class="nav-link" href="#易用"><span class="nav-number">1.</span> <span class="nav-text">易用</span></a></li><li class="nav-item nav-level-4"><a class="nav-link" href="#与Hadoop集成"><span class="nav-number">2.</span> <span class="nav-text">与Hadoop集成</span></a></li></ol></div>
</div>
</div>
<!--/noindex-->
</div>
</aside>
</div>
</main>
<footer id="footer" class="footer">
<div class="footer-inner">
<div class="copyright"> <span class="exturl" data-url="aHR0cDovL3d3dy5iZWlhbi5taWl0Lmdvdi5jbg==">粤ICP备18110871号 </span>© 2017 – <span itemprop="copyrightYear">2021</span>
<span class="with-love" id="animate">
<i class="fa fa-spinner"></i>
</span>
<span class="author" itemprop="copyrightHolder">dudefu</span>
</div>
<!--
<div class="powered-by">由 <span class="exturl theme-link" data-url="aHR0cHM6Ly9oZXhvLmlv">Hexo</span> 强力驱动 v3.8.0</div>
<span class="post-meta-divider">|</span>
<div class="theme-info">主题 – <span class="exturl theme-link" data-url="aHR0cHM6Ly90aGVtZS1uZXh0Lm9yZw==">NexT.Pisces</span> v7.1.2</div>
-->
</div>
</footer>
<div class="back-to-top">
<i class="fa fa-arrow-up"></i>
<span id="scrollpercent"><span>0</span>%</span>
</div>
</div>
<script>
// When window.Promise is not reported as "[object Function]" (missing or not a plain function),
// set it to null. Presumably this lets a Promise library loaded later install itself — TODO confirm.
if (Object.prototype.toString.call(window.Promise) !== '[object Function]') {
window.Promise = null;
}
</script>
<script color="26,26,26" opacity="0.5" zindex="-1" count="99" src="//cdn.jsdelivr.net/gh/theme-next/theme-next-canvas-nest@1/canvas-nest.min.js"></script>
<script id="ribbon" size="300" alpha="0.6" zindex="-1" src="/lib/canvas-ribbon/canvas-ribbon.js"></script>
<script src="/lib/jquery/index.js?v=3.4.1"></script>
<script src="/lib/velocity/velocity.min.js?v=1.2.1"></script>
<script src="/lib/velocity/velocity.ui.min.js?v=1.2.1"></script>
<script src="/lib/reading_progress/reading_progress.js"></script>
<script src="/js/utils.js?v=7.1.2"></script>
<script src="/js/motion.js?v=7.1.2"></script>
<script src="/js/affix.js?v=7.1.2"></script>
<script src="/js/schemes/pisces.js?v=7.1.2"></script>
<script src="/js/scrollspy.js?v=7.1.2"></script>
<script src="/js/post-details.js?v=7.1.2"></script>
<script src="/js/next-boot.js?v=7.1.2"></script>
<script src="/js/js.cookie.js?v=7.1.2"></script>
<script src="/js/scroll-cookie.js?v=7.1.2"></script>
<script src="/js/exturl.js?v=7.1.2"></script>
<script src="//cdn1.lncld.net/static/js/3.11.1/av-min.js"></script>
<script src="//unpkg.com/valine/dist/Valine.min.js"></script>
<script>
// Whitelist of guest-info field names accepted for the comment form.
var GUEST = ['nick', 'mail', 'link'];
// Configured field list, reduced to the entries that appear in the whitelist.
var guest = 'nick,mail,link';
guest = guest.split(',').filter(function(item) {
  return GUEST.indexOf(item) > -1;
});
// Mount the Valine comment widget on the #comments container.
new Valine({
  el: '#comments',
  verify: true,
  notify: true,
  appId: '1N5rpk874DGudJw2wCL9J011-gzGzoHsz',
  appKey: '9Y83e6suJgx567wtxhKy45IN',
  placeholder: 'Just go go',
  avatar: 'mm',
  meta: guest,
  pageSize: '10' || 10,
  visitor: true,
  // Fix: the configured value was the typo 'zk-cn'. Being a truthy string, it also
  // suppressed the 'zh-cn' fallback, so an unknown locale code was passed to Valine.
  lang: 'zh-cn'
});
</script>
<script>
// Options read by the LiveRe embed script; `refer` identifies this page.
window.livereOptions = {
  refer: 'Spark项目落地实战以及日常大数据开发注意事项.html'
};
// Loads the LiveRe embed script asynchronously, unless LivereTower is already defined as a function.
(function (doc, tagName) {
  if (typeof LivereTower === 'function') {
    return;
  }
  var firstOfKind = doc.getElementsByTagName(tagName)[0];
  var embed = doc.createElement(tagName);
  embed.src = 'https://cdn-city.livere.com/js/embed.dist.js';
  embed.async = true;
  // Insert ahead of the first existing <script> element.
  firstOfKind.parentNode.insertBefore(embed, firstOfKind);
})(document, 'script');
</script>
<script>
// State shared by the local-search popup code below.
// isfetched: set to true by searchFunc once the search index has been loaded successfully.
var isfetched = false;
// isXml: selects the "xml" or "json" dataType for the index request.
var isXml = true;
// Path of the search index relative to the site root.
var search_path = "search.xml";
if (search_path.length === 0) {
search_path = "search.xml";
} else if (/json$/i.test(search_path)) {
// A path ending in "json" (case-insensitive) switches the index format to JSON.
isXml = false;
}
// Root-relative URL requested by searchFunc.
var path = "/" + search_path;
// monitor main search box;
// Closes the search popup: hides it, clears the query box, drops any rendered
// results / placeholder / overlay, and restores the body's overflow style.
var onPopupClose = function (e) {
  $('.popup').hide();
  $('#local-search-input').val('');
  $('.search-result-list, #no-result, .local-search-pop-overlay').remove();
  $('body').css('overflow', '');
};
// Shows the search popup: adds a click-to-close overlay, sets the body's overflow
// to hidden, toggles the popup and focuses the input.
function proceedsearch() {
  var $page = $("body");
  $page.append('<div class="search-popup-overlay local-search-pop-overlay"></div>');
  $page.css('overflow', 'hidden');
  $('.search-popup-overlay').click(onPopupClose);
  $('.popup').toggle();
  // Turn off mobile auto-capitalisation / auto-correction before focusing.
  $('#local-search-input')
    .attr({ autocapitalize: "none", autocorrect: "off" })
    .focus();
}
// Local search: shows a loading overlay, fetches the search index, wires the
// search input and renders matching posts into the result container.
//
//   path       - URL of the search index (fetched as XML or JSON depending on the outer `isXml` flag)
//   search_id  - id of the text <input> that holds the query
//   content_id - id of the element whose innerHTML receives the rendered results
//
// On success the outer `isfetched` flag is set so the index is not requested again.
// On failure the loading overlay is removed so the page stays usable.
//
// NOTE(review): titles and URLs from the index are concatenated into HTML without
// escaping; this is only safe while the index is generated from trusted site content.
var searchFunc = function(path, search_id, content_id) {
  'use strict';

  // Removes the loading overlay and restores the body's overflow style.
  function hideLoadingOverlay() {
    $(".local-search-pop-overlay").remove();
    $('body').css('overflow', '');
  }

  // Returns [{position, word}] for each non-overlapping occurrence of `word` in `text`,
  // scanning left to right. An empty `word` yields []. Unless `caseSensitive` is truthy,
  // both strings are lower-cased before matching.
  function getIndexByWord(word, text, caseSensitive) {
    var wordLen = word.length;
    if (wordLen === 0) {
      return [];
    }
    var startPosition = 0, position = [], index = [];
    if (!caseSensitive) {
      text = text.toLowerCase();
      word = word.toLowerCase();
    }
    while ((position = text.indexOf(word, startPosition)) > -1) {
      index.push({position: position, word: word});
      startPosition = position + wordLen;
    }
    return index;
  }

  // start loading animation
  $("body")
    .append('<div class="search-popup-overlay local-search-pop-overlay">' +
      '<div id="search-loading-icon">' +
      '<i class="fa fa-spinner fa-pulse fa-5x fa-fw"></i>' +
      '</div>' +
      '</div>')
    .css('overflow', 'hidden');
  $("#search-loading-icon").css('margin', '20% auto 0 auto').css('text-align', 'center');

  $.ajax({
    url: path,
    dataType: isXml ? "xml" : "json",
    async: true,
    // Fix: without an error handler a failed index request left the spinner overlay
    // on screen and the body locked at overflow:hidden.
    error: function() {
      hideLoadingOverlay();
    },
    success: function(res) {
      // get the contents from search data
      isfetched = true;
      $('.popup').detach().appendTo('.header-inner');
      var datas = isXml ? $("entry", res).map(function() {
        return {
          title: $("title", this).text(),
          content: $("content", this).text(),
          url: $("url", this).text()
        };
      }).get() : res;
      var input = document.getElementById(search_id);
      var resultContent = document.getElementById(content_id);

      // Re-runs the search for the current input value and re-renders the result list.
      var inputEventFunction = function() {
        var searchText = input.value.trim().toLowerCase();
        var keywords = searchText.split(/[\s\-]+/);
        // For multi-word queries the whole phrase is searched as an extra keyword.
        if (keywords.length > 1) {
          keywords.push(searchText);
        }
        var resultItems = [];
        if (searchText.length > 0) {
          // perform local searching
          datas.forEach(function(data) {
            var isMatch = false;
            var hitCount = 0;
            var searchTextCount = 0;
            var title = data.title.trim();
            var titleInLowerCase = title.toLowerCase();
            // Strip tags so matching and excerpts work on the text content only.
            var content = data.content.trim().replace(/<[^>]+>/g, "");
            var contentInLowerCase = content.toLowerCase();
            var articleUrl = decodeURIComponent(data.url).replace(/\/{2,}/g, '/');
            var indexOfTitle = [];
            var indexOfContent = [];
            // only match articles with not empty titles
            if (title != '') {
              keywords.forEach(function(keyword) {
                indexOfTitle = indexOfTitle.concat(getIndexByWord(keyword, titleInLowerCase, false));
                indexOfContent = indexOfContent.concat(getIndexByWord(keyword, contentInLowerCase, false));
              });
              if (indexOfTitle.length > 0 || indexOfContent.length > 0) {
                isMatch = true;
                hitCount = indexOfTitle.length + indexOfContent.length;
              }
            }
            // show search results
            if (isMatch) {
              // Sort hits by descending position so the earliest hit sits at the END of
              // the array; mergeIntoSlice consumes hits with pop().
              [indexOfTitle, indexOfContent].forEach(function (index) {
                index.sort(function (itemLeft, itemRight) {
                  if (itemRight.position !== itemLeft.position) {
                    return itemRight.position - itemLeft.position;
                  } else {
                    return itemLeft.word.length - itemRight.word.length;
                  }
                });
              });

              // Consumes hits from the tail of `index` that end at or before `end`,
              // skipping hits that overlap the previously accepted one, and returns
              // them as one slice {hits, start, end, searchTextCount}.
              function mergeIntoSlice(text, start, end, index) {
                var item = index[index.length - 1];
                var position = item.position;
                var word = item.word;
                var hits = [];
                var searchTextCountInSlice = 0;
                while (position + word.length <= end && index.length != 0) {
                  if (word === searchText) {
                    searchTextCountInSlice++;
                  }
                  hits.push({position: position, length: word.length});
                  var wordEnd = position + word.length;
                  // move to next position of hit
                  index.pop();
                  while (index.length != 0) {
                    item = index[index.length - 1];
                    position = item.position;
                    word = item.word;
                    if (wordEnd > position) {
                      index.pop();
                    } else {
                      break;
                    }
                  }
                }
                searchTextCount += searchTextCountInSlice;
                return {
                  hits: hits,
                  start: start,
                  end: end,
                  searchTextCount: searchTextCountInSlice
                };
              }

              var slicesOfTitle = [];
              if (indexOfTitle.length != 0) {
                slicesOfTitle.push(mergeIntoSlice(title, 0, title.length, indexOfTitle));
              }
              var slicesOfContent = [];
              while (indexOfContent.length != 0) {
                var item = indexOfContent[indexOfContent.length - 1];
                var position = item.position;
                var word = item.word;
                // cut out 100 characters
                var start = position - 20;
                var end = position + 80;
                if (start < 0) {
                  start = 0;
                }
                if (end < position + word.length) {
                  end = position + word.length;
                }
                if (end > content.length) {
                  end = content.length;
                }
                slicesOfContent.push(mergeIntoSlice(content, start, end, indexOfContent));
              }
              // sort slices in content by search text's count and hits' count
              slicesOfContent.sort(function (sliceLeft, sliceRight) {
                if (sliceLeft.searchTextCount !== sliceRight.searchTextCount) {
                  return sliceRight.searchTextCount - sliceLeft.searchTextCount;
                } else if (sliceLeft.hits.length !== sliceRight.hits.length) {
                  return sliceRight.hits.length - sliceLeft.hits.length;
                } else {
                  return sliceLeft.start - sliceRight.start;
                }
              });
              // select top N slices in content
              var upperBound = parseInt('1');
              if (upperBound >= 0) {
                slicesOfContent = slicesOfContent.slice(0, upperBound);
              }

              // Rebuilds text[slice.start, slice.end) with every hit wrapped in
              // <b class="search-keyword">.
              function highlightKeyword(text, slice) {
                var result = '';
                var prevEnd = slice.start;
                slice.hits.forEach(function (hit) {
                  result += text.substring(prevEnd, hit.position);
                  var end = hit.position + hit.length;
                  result += '<b class="search-keyword">' + text.substring(hit.position, end) + '</b>';
                  prevEnd = end;
                });
                result += text.substring(prevEnd, slice.end);
                return result;
              }

              var resultItem = '';
              if (slicesOfTitle.length != 0) {
                resultItem += "<li><a href='" + articleUrl + "' class='search-result-title'>" + highlightKeyword(title, slicesOfTitle[0]) + "</a>";
              } else {
                resultItem += "<li><a href='" + articleUrl + "' class='search-result-title'>" + title + "</a>";
              }
              slicesOfContent.forEach(function (slice) {
                resultItem += "<a href='" + articleUrl + "'>" +
                  "<p class=\"search-result\">" + highlightKeyword(content, slice) +
                  "...</p>" + "</a>";
              });
              resultItem += "</li>";
              resultItems.push({
                item: resultItem,
                searchTextCount: searchTextCount,
                hitCount: hitCount,
                id: resultItems.length
              });
            }
          });
        }
        if (keywords.length === 1 && keywords[0] === "") {
          // Empty query: show the neutral search icon.
          resultContent.innerHTML = '<div id="no-result"><i class="fa fa-search fa-5x"></i></div>';
        } else if (resultItems.length === 0) {
          // Non-empty query without matches.
          resultContent.innerHTML = '<div id="no-result"><i class="fa fa-frown-o fa-5x"></i></div>';
        } else {
          // Rank by whole-phrase matches, then by total hits, then by higher id first.
          resultItems.sort(function (resultLeft, resultRight) {
            if (resultLeft.searchTextCount !== resultRight.searchTextCount) {
              return resultRight.searchTextCount - resultLeft.searchTextCount;
            } else if (resultLeft.hitCount !== resultRight.hitCount) {
              return resultRight.hitCount - resultLeft.hitCount;
            } else {
              return resultRight.id - resultLeft.id;
            }
          });
          var searchResultList = '<ul class=\"search-result-list\">';
          resultItems.forEach(function (result) {
            searchResultList += result.item;
          });
          searchResultList += "</ul>";
          resultContent.innerHTML = searchResultList;
        }
      };

      // Trigger mode is fixed to 'auto' on this page: search on every input event.
      if ('auto' === 'auto') {
        input.addEventListener('input', inputEventFunction);
      } else {
        $('.search-icon').click(inputEventFunction);
        input.addEventListener('keypress', function (event) {
          if (event.keyCode === 13) {
            inputEventFunction();
          }
        });
      }

      // remove loading animation
      hideLoadingOverlay();
      proceedsearch();
    }
  });
};
// handle and trigger popup window;
// Search trigger: the first click loads the index through searchFunc,
// later clicks open the popup directly.
$('.popup-trigger').click(function (evt) {
  evt.stopPropagation();
  if (isfetched !== false) {
    proceedsearch();
    return;
  }
  searchFunc(path, 'local-search-input', 'local-search-result');
});

// The close button dismisses the popup.
$('.popup-btn-close').click(onPopupClose);

// Clicks inside the popup do not propagate to ancestor elements.
$('.popup').click(function (evt) {
  evt.stopPropagation();
});

// Esc (key code 27) closes the popup, but only while it is visible.
$(document).on('keyup', function (event) {
  if (event.which !== 27) {
    return;
  }
  if ($('.search-popup').is(':visible')) {
    onPopupClose();
  }
});
</script>
<script src="https://www.gstatic.com/firebasejs/4.6.0/firebase.js"></script>
<script src="https://www.gstatic.com/firebasejs/4.6.0/firebase-firestore.js"></script>
<script src="https://cdnjs.cloudflare.com/ajax/libs/bluebird/3.5.1/bluebird.core.min.js"></script>
<script>
// Firestore-backed view counter. On a post page it reads (and, on the first visit
// recorded in localStorage, increments) the post's count and appends it to .post-meta.
// On the index page it appends a read-only count to each .post-meta element.
(function () {
  // Firebase credentials as emitted for this page (both are empty strings here).
  var firebaseConfig = {
    apiKey: '',
    projectId: ''
  }
  // Fix: with empty credentials there is no Firestore project to talk to, so skip
  // the counter entirely instead of initialising Firebase and failing at runtime.
  if (!firebaseConfig.apiKey || !firebaseConfig.projectId) {
    return
  }
  firebase.initializeApp(firebaseConfig)

  // Resolves to the view count stored in `doc`.
  // When `increaseCount` is truthy (post page) the count is initialised to 1 or
  // incremented, unless localStorage already holds an entry keyed by the post title.
  function getCount(doc, increaseCount) {
    //increaseCount will be false when not in article page
    return doc.get().then(function (d) {
      var count
      if (!d.exists) { //has no data, initialize count
        if (increaseCount) {
          doc.set({
            count: 1
          })
          count = 1
        }
        else {
          count = 0
        }
      }
      else { //has data
        count = d.data().count
        if (increaseCount) {
          if (!(window.localStorage && window.localStorage.getItem(title))) { //if first view this article
            doc.set({ //increase count
              count: count + 1
            })
            count++
          }
        }
      }
      if (window.localStorage && increaseCount) { //mark as visited
        localStorage.setItem(title, true)
      }
      return count
    })
  }

  // Returns a callback that appends a "| <icon> 阅读次数 N" span to `el`.
  function appendCountTo(el) {
    return function (count) {
      $(el).append(
        $('<span>').addClass('post-visitors-count').append(
          $('<span>').addClass('post-meta-divider').text('|')
        ).append(
          $('<span>').addClass('post-meta-item-icon').append(
            $('<i>').addClass('fa fa-users')
          )
        ).append($('<span>').text('阅读次数 ' + count))
      )
    }
  }

  var db = firebase.firestore()
  var articles = db.collection('articles')
  //https://hexo.io/docs/variables.html
  // Page-type flags derived from string literals written into the page.
  var isPost = 'Spark项目落地实战以及日常大数据开发注意事项'.length > 0
  var isArchive = '' === 'true'
  var isCategory = ''.length > 0
  var isTag = ''.length > 0
  if (isPost) { //is article page
    // `title` is also read by getCount as the localStorage key.
    var title = 'Spark项目落地实战以及日常大数据开发注意事项'
    var doc = articles.doc(title)
    getCount(doc, true).then(appendCountTo($('.post-meta')))
  }
  else if (!isArchive && !isCategory && !isTag) { //is index page
    var titles = [] //array to titles
    var postsstr = '' //if you have a better way to get titles of posts, please change it
    // NOTE(review): eval of a page-embedded string; it is empty on this page.
    eval(postsstr)
    var promises = titles.map(function (title) {
      return articles.doc(title)
    }).map(function (doc) {
      return getCount(doc)
    })
    Promise.all(promises).then(function (counts) {
      var metas = $('.post-meta')
      counts.forEach(function (val, idx) {
        appendCountTo(metas[idx])(val)
      })
    })
  }
})()
</script>
<script>
// Loads PDFObject on demand and embeds a viewer into every div.pdf,
// but only when the page contains at least one such element.
(function () {
  var PDF_SELECTOR = 'div.pdf';
  if (!$('body').find(PDF_SELECTOR).length) {
    return;
  }

  // Embeds the document named by the element's `target` attribute into that element.
  function embedInto(i, o) {
    var $holder = $(o);
    PDFObject.embed($holder.attr('target'), $(o), {
      pdfOpenParams: {
        navpanes: 0,
        toolbar: 0,
        statusbar: 0,
        pagemode: 'thumbs',
        view: 'FitH'
      },
      PDFJS_URL: '/lib/pdf/web/viewer.html',
      height: $holder.attr('height') || '500px'
    });
  }

  $.ajax({
    type: 'GET',
    url: '//cdn.jsdelivr.net/npm/pdfobject@2/pdfobject.min.js',
    dataType: 'script',
    cache: true,
    success: function () {
      // Query again once the library has loaded.
      $('body').find(PDF_SELECTOR).each(embedInto);
    }
  });
})();
</script>
<script>
// Loads mermaid on demand and initialises it, but only when the page
// contains at least one pre.mermaid element.
(function () {
  var hasDiagrams = $('body').find('pre.mermaid').length;
  if (!hasDiagrams) {
    return;
  }

  // Runs after the script has been fetched and executed.
  function initMermaid() {
    mermaid.initialize({
      theme: 'dark',
      logLevel: 3,
      flowchart: { curve: 'linear' },
      gantt: { axisFormat: '%m/%d/%Y' },
      sequence: { actorMargin: 50 }
    });
  }

  $.ajax({
    type: 'GET',
    url: '//cdn.jsdelivr.net/npm/mermaid@8/dist/mermaid.min.js',
    dataType: 'script',
    cache: true,
    success: initMermaid
  });
})();
</script>
<script>
// Injects the Baidu link-submit push script, choosing the URL by the page's protocol.
(function () {
  var pushScript = document.createElement('script');
  var scheme = window.location.protocol.split(':')[0];
  if (scheme === 'https') {
    pushScript.src = 'https://zz.bdstatic.com/linksubmit/push.js';
  } else {
    pushScript.src = 'http://push.zhanzhang.baidu.com/push.js';
  }
  // Insert ahead of the first <script> element in the document.
  var firstScript = document.getElementsByTagName("script")[0];
  firstScript.parentNode.insertBefore(pushScript, firstScript);
})();
</script>
<script src="/lib/bookmark/bookmark.min.js?v=1.0"></script>
<script>
// Calls the bookmark library's scrollToMark with 'auto' and "#更多".
// NOTE(review): the argument semantics are defined in /lib/bookmark/bookmark.min.js (not visible here) — confirm.
bookmark.scrollToMark('auto', "#更多");
</script>
<script>
// Adds a "复制" (copy) button to every highlighted code block except those inside
// embedded gists. Each block is moved into a new .highlight-wrap container that
// also holds the button.
$('.highlight').not('.gist .highlight').each(function(i, e) {
  var $wrap = $('<div>').addClass('highlight-wrap');
  $(e).after($wrap);
  $wrap.append($('<button>').addClass('copy-btn').append('复制').on('click', function(e) {
    // Collect the text of every .line inside the block's .code element, one per line.
    var code = $(this).parent().find('.code').find('.line').map(function(i, e) {
      return $(e).text();
    }).toArray().join('\n');
    // Copy through a temporary, invisible textarea placed at the current scroll position.
    var ta = document.createElement('textarea');
    var yPosition = window.pageYOffset || document.documentElement.scrollTop;
    ta.style.top = yPosition + 'px'; // Prevent page scroll
    ta.style.position = 'absolute';
    ta.style.opacity = '0';
    ta.readOnly = true;
    ta.value = code;
    document.body.appendChild(ta);
    // Remember the user's current selection so it can be restored afterwards.
    const selection = document.getSelection();
    const selected = selection.rangeCount > 0 ? selection.getRangeAt(0) : false;
    ta.select();
    ta.setSelectionRange(0, code.length);
    ta.readOnly = false;
    var result = document.execCommand('copy');
    if (result) $(this).text('复制成功');
    else $(this).text('复制失败');
    ta.blur(); // For iOS
    $(this).blur();
    if (selected) {
      selection.removeAllRanges();
      selection.addRange(selected);
    }
    // Fix: the helper textarea used to stay in <body> after every click,
    // so repeated copies accumulated hidden elements.
    document.body.removeChild(ta);
  })).on('mouseleave', function(e) {
    // Reset the button label shortly after the pointer leaves the wrapper.
    var $b = $(this).find('.copy-btn');
    setTimeout(function() {
      $b.text('复制');
    }, 300);
  }).append(e);
})
</script>
</body>
</html>
此处可能存在不合适展示的内容,页面不予展示。您可通过相关编辑功能自查并修改。
如您确认内容无涉及 不当用语 / 纯广告导流 / 暴力 / 低俗色情 / 侵权 / 盗版 / 虚假 / 无价值内容或违法国家有关法律法规的内容,可点击提交进行申诉,我们将尽快为您处理。